def __call__(self, sequence=Stream.NoNewData):

    if isinstance(sequence, More):
        sequence = sequence.output

    if sequence in [Stream.EndOfStream, Stream.NoNewData]:
        return sequence

    data = sequence.data
    active = data[0]

    sw = sequence.sliding_window
    start = sw[0].middle

    timeline = Timeline()
    timeline.start = start

    for i, y in enumerate(data):

        # active --> inactive: close the current segment
        if active and not y:
            segment = Segment(start, sw[i].middle)
            timeline.add(segment)
            active = False

        # inactive --> active: open a new segment
        elif not active and y:
            active = True
            start = sw[i].middle

    # still active at the end: close the final segment
    if active:
        segment = Segment(start, sw[i].middle)
        timeline.add(segment)

    timeline.end = sw[i].middle

    return timeline
def apply(self, features, segmentation=None):
    """
    Parameters
    ----------
    features : Features
    segmentation : Timeline, optional
    """
    if segmentation is None:
        segmentation = Timeline(segments=[features.getExtent()])

    sliding_window = features.sliding_window
    min_samples = sliding_window.durationToSamples(self.min_duration)
    precision = sliding_window.durationToSamples(self.precision)

    segmenter = SKLearnBICSegmentation(
        penalty_coef=self.penalty_coef,
        covariance_type=self.covariance_type,
        min_samples=min_samples,
        precision=precision)

    result = Timeline()

    for long_segment in segmentation:

        X = features.crop(long_segment)
        boundaries = segmenter.apply(X)

        # convert consecutive boundary pairs back to segments, shifted
        # from local (within long_segment) to global timestamps
        for t, T in pairwise(boundaries):
            segment = sliding_window.rangeToSegment(t, T - t)
            shifted_segment = Segment(long_segment.start + segment.start,
                                      long_segment.start + segment.end)
            result.add(shifted_segment)

    return result
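# A quick standalone check (not part of the original) of the pairwise() step
# used above: it turns the boundary sample indices returned by the BIC
# segmenter into consecutive (t, T) windows. The recipe below is the usual
# itertools one, assumed to be what the snippet imports.
from itertools import tee

def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

assert list(pairwise([0, 40, 90, 120])) == [(0, 40), (40, 90), (90, 120)]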
def write_test_file(data_dir, output_file, trial_length):
    annotations, max_length, speakers = read_annotations(data_dir)

    # create artificial non-overlapping segments, each of trial_length size
    trial_segments = Timeline()
    for i in range(0, int(max_length) // trial_length):
        trial_segments.add(Segment(start=i * trial_length,
                                   end=(i + 1) * trial_length))

    with open(output_file, 'w') as f:
        for label in speakers.keys():
            for annotation in annotations:
                # make sure our trial segments are not extending beyond the
                # total length of the speech data
                support = annotation.get_timeline().extent()
                # we consider a smaller extent here to make sure an embedding
                # of 3 seconds can be computed
                adjusted_trial_segments = trial_segments.crop(
                    Segment(start=support.start, end=support.end - 3.),
                    mode='loose')
                uri = annotation.uri
                cur_timeline = annotation.label_timeline(label, copy=False)
                for trial_segment in adjusted_trial_segments:
                    cropped_speaker = cur_timeline.crop(trial_segment,
                                                        mode='intersection')
                    if not cropped_speaker:
                        f.write('{0} {1} {2:0>7.2f} {3:0>7.2f} nontarget - -\n'.format(
                            label, uri,
                            trial_segment.start, trial_segment.end))
                    else:
                        f.write('{0} {1} {2:0>7.2f} {3:0>7.2f} target {4:0>7.2f} {5:0>7.2f}\n'.format(
                            label, uri,
                            trial_segment.start, trial_segment.end,
                            cropped_speaker[0].start,
                            cropped_speaker[0].duration))
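# Hypothetical call for write_test_file; the paths and trial length are made
# up for illustration, and read_annotations() is assumed to be defined
# elsewhere in the same project. Each output line describes one
# (speaker, uri, start, end) trial, marked target or nontarget.
write_test_file('data/annotations/', 'trials.txt', trial_length=10)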
def _preprocess(self, reference, hypothesis):

    if not isinstance(reference, Annotation):
        raise TypeError('reference must be an instance of `Annotation`')

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # reference where short intra-label gaps are removed
    filled = Timeline()
    for label in reference.labels():
        label_timeline = reference.label_timeline(label)
        for gap in label_timeline.gaps():
            if gap.duration < self.tolerance:
                label_timeline.add(gap)

        for segment in label_timeline.coverage():
            filled.add(segment)

    # reference coverage after filling gaps
    coverage = filled.coverage()

    reference_partition = self._partition(filled, coverage)
    hypothesis_partition = self._partition(hypothesis, coverage)

    return reference_partition, hypothesis_partition
def _preprocess(self, reference, hypothesis):

    if not isinstance(reference, Annotation):
        raise TypeError('reference must be an instance of `Annotation`')

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # reference where short intra-label gaps are removed
    filled = Timeline()
    for label in reference.labels():
        label_timeline = reference.label_timeline(label)
        for gap in label_timeline.gaps():
            if gap.duration < self.tolerance:
                label_timeline.add(gap)

        for segment in label_timeline.support():
            filled.add(segment)

    # reference coverage after filling gaps
    coverage = filled.support()

    reference_partition = self._partition(filled, coverage)
    hypothesis_partition = self._partition(hypothesis, coverage)

    return reference_partition, hypothesis_partition
def test_remove_and_extent():
    t = Timeline(uri='MyAudioFile')
    t.add(Segment(6, 8))
    t.add(Segment(7, 9))
    t.add(Segment(6, 9))
    t.remove(Segment(6, 9))
    # the remaining segments [6, 8] and [7, 9] still span [6, 9]
    assert t.extent() == Segment(6, 9)
def apply(self, predictions, dimension=0):
    """Peak detection

    Parameters
    ----------
    predictions : SlidingWindowFeature
        Predictions returned by segmentation approaches.

    Returns
    -------
    segmentation : Timeline
        Partition.
    """
    if len(predictions.data.shape) == 1:
        y = predictions.data
    elif predictions.data.shape[1] == 1:
        y = predictions.data[:, 0]
    else:
        y = predictions.data[:, dimension]

    if self.log_scale:
        y = np.exp(y)

    sw = predictions.sliding_window

    precision = sw.step
    order = max(1, int(np.rint(self.min_duration / precision)))
    indices = scipy.signal.argrelmax(y, order=order)[0]

    if self.scale == 'absolute':
        mini = 0
        maxi = 1

    elif self.scale == 'relative':
        mini = np.nanmin(y)
        maxi = np.nanmax(y)

    elif self.scale == 'percentile':
        mini = np.nanpercentile(y, 1)
        maxi = np.nanpercentile(y, 99)

    threshold = mini + self.alpha * (maxi - mini)

    peak_time = np.array(
        [sw[i].middle for i in indices if y[i] > threshold])

    n_windows = len(y)
    start_time = sw[0].start
    end_time = sw[n_windows].end
    boundaries = np.hstack([[start_time], peak_time, [end_time]])

    segmentation = Timeline()
    for start, end in pairwise(boundaries):
        segment = Segment(start, end)
        segmentation.add(segment)

    return segmentation
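# A minimal usage sketch for the peak-detection method above. The class name
# `Peak` and its constructor arguments are assumptions inferred from the
# attributes the method reads (alpha, min_duration, scale, log_scale); they
# are not defined in this snippet.
import numpy as np
from pyannote.core import SlidingWindow, SlidingWindowFeature

scores = SlidingWindowFeature(
    np.array([0.1, 0.2, 0.9, 0.2, 0.1, 0.3, 0.8, 0.2]),
    SlidingWindow(start=0.0, duration=0.1, step=0.1))

peak = Peak(alpha=0.5, min_duration=0.1, scale='relative', log_scale=False)
# cuts the extent at the two local maxima (0.9 and 0.8), both above the
# relative threshold 0.1 + 0.5 * (0.9 - 0.1) = 0.5
partition = peak.apply(scores)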
def apply(self, predictions, dimension=0):
    """Peak detection

    Parameters
    ----------
    predictions : SlidingWindowFeature
        Predictions returned by segmentation approaches.

    Returns
    -------
    segmentation : Timeline
        Partition.
    """
    if len(predictions.data.shape) == 1:
        y = predictions.data
    elif predictions.data.shape[1] == 1:
        y = predictions.data[:, 0]
    else:
        y = predictions.data[:, dimension]

    if self.log_scale:
        y = np.exp(y)

    sw = predictions.sliding_window

    precision = sw.step
    order = max(1, int(np.rint(self.min_duration / precision)))
    indices = scipy.signal.argrelmax(y, order=order)[0]

    if self.scale == 'absolute':
        mini = 0
        maxi = 1

    elif self.scale == 'relative':
        # note: the original referenced an undefined `data` here; the scaled
        # scores `y` are what is thresholded
        mini = np.nanmin(y)
        maxi = np.nanmax(y)

    elif self.scale == 'percentile':
        mini = np.nanpercentile(y, 1)
        maxi = np.nanpercentile(y, 99)

    threshold = mini + self.alpha * (maxi - mini)

    peak_time = np.array([sw[i].middle for i in indices if y[i] > threshold])

    n_windows = len(y)
    start_time = sw[0].start
    end_time = sw[n_windows].end
    boundaries = np.hstack([[start_time], peak_time, [end_time]])

    segmentation = Timeline()
    for start, end in pairwise(boundaries):
        segment = Segment(start, end)
        segmentation.add(segment)

    return segmentation
def timeline():
    t = Timeline(uri='MyAudioFile')
    t.add(Segment(6, 8))
    t.add(Segment(0.5, 3))
    t.add(Segment(8.5, 10))
    t.add(Segment(1, 4))
    t.add(Segment(5, 7))
    t.add(Segment(7, 8))
    return t
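# A quick sketch (not part of the original fixture) of what the fixture above
# contains once merged: overlapping and contiguous segments collapse when
# taking the support.
from pyannote.core import Segment, Timeline

t = Timeline(uri='MyAudioFile')
for segment in [Segment(6, 8), Segment(0.5, 3), Segment(8.5, 10),
                Segment(1, 4), Segment(5, 7), Segment(7, 8)]:
    t.add(segment)

# (0.5, 3) and (1, 4) merge; (5, 7), (7, 8) and (6, 8) merge
assert list(t.support()) == [Segment(0.5, 4), Segment(5, 8), Segment(8.5, 10)]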
def run(self):
    with self.in_subtitles().open('r') as fp:
        transcription = pyannote.core.json.load(fp)

    timeline = Timeline()
    for start, end, edge in transcription.ordered_edges_iter(data=True):
        if 'subtitle' not in edge:
            continue
        segment = Segment(start, end)
        timeline.add(segment)

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(timeline, fp)
def test_added_empty_segments():
    # The first timeline includes empty segments.
    first_timeline = Timeline()
    first_timeline.add(Segment(1, 5))
    first_timeline.add(Segment(6, 6))
    first_timeline.add(Segment(7, 7))
    first_timeline.add(Segment(8, 10))

    # The second has no empty segments.
    second_timeline = Timeline()
    second_timeline.add(Segment(1, 5))
    second_timeline.add(Segment(8, 10))

    # empty segments are ignored, so both timelines compare equal
    assert first_timeline == second_timeline
def overlap_timeline(uri, annotation):
    timeline = annotation.get_timeline()
    # partition the timeline into elementary (non-overlapping) segments
    segmentation = timeline.segmentation()
    l_segments = [{'seg': segment, 'count': 0} for segment in segmentation]
    # count how many original segments cover each elementary segment
    for seg in timeline:
        for curr in l_segments:
            if curr['seg'] in seg:
                curr['count'] += 1
    # keep elementary segments covered more than once
    overlap_timeline = Timeline(uri=uri)
    for curr in l_segments:
        if curr['count'] > 1:
            overlap_timeline.add(curr['seg'])
    return overlap_timeline
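# A small self-contained sketch (not in the original) of overlap_timeline:
# two speakers overlap on [4, 6], so that is the only elementary segment
# counted more than once.
from pyannote.core import Annotation, Segment

annotation = Annotation(uri='demo')
annotation[Segment(0, 6), 'A'] = 'alice'
annotation[Segment(4, 10), 'B'] = 'bob'

assert list(overlap_timeline('demo', annotation)) == [Segment(4, 6)]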
def uem_timeline_from_file(uem_file, uniq_name=''):
    """Output pyannote timeline segments for a UEM file

    <UEM> file format:
        UNIQ_SPEAKER_ID CHANNEL START_TIME END_TIME
    """
    timeline = Timeline(uri=uniq_name)
    with open(uem_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            speaker_id, channel, start_time, end_time = line.split()
            timeline.add(Segment(float(start_time), float(end_time)))

    return timeline
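# A hypothetical end-to-end check for uem_timeline_from_file; the file name
# and its two lines are made up for illustration.
from pyannote.core import Segment

with open('demo.uem', 'w') as f:
    f.write('file1 1 0.00 12.50\n')
    f.write('file1 1 20.00 25.00\n')

timeline = uem_timeline_from_file('demo.uem', uniq_name='file1')
assert list(timeline) == [Segment(0.0, 12.5), Segment(20.0, 25.0)]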
def main():
    usage = "%prog [options] VADRTTM OVERLAPRTTM OUTPUTRTTM"
    desc = "Convert the txt file from diarization of the form: \
            ID t_in t_out \
            into a kaldi format file for the spkdet task"
    version = "%prog 0.1"
    parser = OptionParser(usage=usage, description=desc, version=version)
    (opt, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Incorrect number of arguments")
    vadrttm, overlaprttm, outputrttm = args

    # Read the documents and load them in memory
    vad = pyannote.database.util.load_rttm(vadrttm)
    ovl = pyannote.database.util.load_rttm(overlaprttm)

    fw = open(outputrttm, 'wt')
    for name in vad:
        # Examples
        # speech = vad['EN2002a.Mix-Headset-0000000-0006000'].get_timeline()
        # duration = vad['EN2002a.Mix-Headset-0000000-0006000'].get_timeline()[-1][1]
        # overlap = ovl['EN2002a.Mix-Headset-0000000-0006000'].get_timeline()
        speech = vad[name].get_timeline()
        duration = vad[name].get_timeline()[-1][1]
        if name in ovl.keys():
            overlap = ovl[name].get_timeline()
            # just get the intersections of the VAD and overlap
            intersection = Timeline()
            for speech_segment, overlap_segment in speech.co_iter(overlap):
                intersection.add(speech_segment & overlap_segment)
            keep = intersection.gaps(support=Segment(0, duration))
            vad_without_overlap = speech.crop(keep)
        else:
            vad_without_overlap = speech

        # Write RTTM
        write_rttm(fw, vad_without_overlap, label='speech')
    fw.close()
def test_timeline_overlaps():
    overlapped_tl = Timeline(uri="La menuiserie mec")
    overlapped_tl.add(Segment(0, 10))
    overlapped_tl.add(Segment(5, 10))
    overlapped_tl.add(Segment(15, 20))
    overlapped_tl.add(Segment(18, 23))

    expected_overlap = Timeline()
    expected_overlap.add(Segment(5, 10))
    expected_overlap.add(Segment(18, 20))

    assert expected_overlap == overlapped_tl.get_overlap()
def test_crop(timeline):
    selection = Segment(3, 7)

    expected_answer = Timeline(uri='MyAudioFile')
    expected_answer.add(Segment(3, 4))
    expected_answer.add(Segment(5, 7))
    expected_answer.add(Segment(6, 7))
    assert timeline.crop(selection, mode='intersection') == expected_answer

    expected_answer = Timeline(uri='MyAudioFile')
    expected_answer.add(Segment(5, 7))
    assert timeline.crop(selection, mode='strict') == expected_answer

    # note: timeline equality compares segments only, not URIs
    expected_answer = Timeline(uri="pouet")
    expected_answer.add(Segment(1, 4))
    expected_answer.add(Segment(5, 7))
    expected_answer.add(Segment(6, 8))
    assert timeline.crop(selection, mode='loose') == expected_answer
def _get_collar(self, reference, duration):

    # initialize empty timeline
    collar = Timeline(uri=reference.uri)

    if duration == 0.:
        return collar

    # iterate over all segments in reference
    for segment in reference.itersegments():

        # add collar centered on start time
        t = segment.start
        collar.add(Segment(t - .5 * duration, t + .5 * duration))

        # add collar centered on end time
        t = segment.end
        collar.add(Segment(t - .5 * duration, t + .5 * duration))

    # merge overlapping collars and return
    return collar.coverage()
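# A standalone sketch (not in the original) of the collar construction above;
# the logic does not depend on instance state. With a 0.5 s collar around each
# boundary of reference segment [1, 3], we get [0.75, 1.25] and [2.75, 3.25].
from pyannote.core import Annotation, Segment, Timeline

reference = Annotation(uri='demo')
reference[Segment(1, 3)] = 'speech'

collar = Timeline(uri='demo')
for segment in reference.itersegments():
    for t in (segment.start, segment.end):
        collar.add(Segment(t - 0.25, t + 0.25))

assert list(collar.support()) == [Segment(0.75, 1.25), Segment(2.75, 3.25)]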
def run(self):

    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / rate
    extent = Segment(0., duration)

    with self.in_speaker().open('r') as fp:
        speaker = pyannote.core.json.load(fp)

    timeline = Timeline()
    for segment, _ in speaker.itertracks():
        timeline.add(segment)

    # fill gaps
    for gap in timeline.gaps(extent):
        if gap.duration < self.fill_gaps:
            timeline.add(gap)

    timeline = timeline.coverage()

    # dump as annotation...
    if self.to_annotation:

        annotation = Annotation()
        for s, segment in enumerate(timeline):
            annotation[segment] = s
        annotation = annotation.anonymize_labels(generator='string')

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)

    # ... or as timeline
    else:
        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(timeline, fp)
def to_overlap(reference: Annotation) -> Annotation:
    """Get overlapped speech reference annotation

    Parameters
    ----------
    reference : Annotation
        File yielded by pyannote.database protocols.

    Returns
    -------
    overlap : `pyannote.core.Annotation`
        Overlapped speech reference.
    """
    overlap = Timeline(uri=reference.uri)
    for (s1, t1), (s2, t2) in reference.co_iter(reference):
        l1 = reference[s1, t1]
        l2 = reference[s2, t2]
        if l1 == l2:
            continue
        overlap.add(s1 & s2)

    return overlap.support().to_annotation()
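# A minimal usage sketch for to_overlap, with a made-up two-speaker file:
# alice and bob overlap on [4, 6], so that is the only overlapped region.
from pyannote.core import Annotation, Segment, Timeline

reference = Annotation(uri='demo')
reference[Segment(0, 6), 'A'] = 'alice'
reference[Segment(4, 10), 'B'] = 'bob'

overlap = to_overlap(reference)
assert overlap.get_timeline() == Timeline([Segment(4, 6)])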
def test_crop(timeline):
    selection = Segment(3, 7)

    expected_answer = Timeline(uri='MyAudioFile')
    expected_answer.add(Segment(3, 4))
    expected_answer.add(Segment(5, 7))
    expected_answer.add(Segment(6, 7))
    assert timeline.crop(selection, mode='intersection') == expected_answer

    expected_answer = Timeline(uri='MyAudioFile')
    expected_answer.add(Segment(5, 7))
    assert timeline.crop(selection, mode='strict') == expected_answer

    expected_answer = Timeline(uri="pouet")
    expected_answer.add(Segment(1, 4))
    expected_answer.add(Segment(5, 7))
    expected_answer.add(Segment(6, 8))
    assert timeline.crop(selection, mode='loose') == expected_answer
def test_extrude():
    removed = Segment(2, 5)

    timeline = Timeline(uri='KINGJU')
    timeline.add(Segment(0, 3))
    timeline.add(Segment(2, 5))
    timeline.add(Segment(6, 7))

    expected_answer = Timeline()
    expected_answer.add(Segment(0, 2))
    expected_answer.add(Segment(6, 7))
    assert timeline.extrude(removed, mode='intersection') == expected_answer

    expected_answer = Timeline(uri="MCSALO")
    expected_answer.add(Segment(0, 3))
    expected_answer.add(Segment(6, 7))
    assert timeline.extrude(removed, mode='strict') == expected_answer

    expected_answer = Timeline(uri="CADILLAC")
    expected_answer.add(Segment(6, 7))
    assert timeline.extrude(removed, mode='loose') == expected_answer
def apply(self, predictions):
    """Peak detection

    Parameters
    ----------
    predictions : SlidingWindowFeature
        Predictions returned by segmentation approaches.

    Returns
    -------
    segmentation : Timeline
        Partition.
    """
    y = predictions.data
    sw = predictions.sliding_window

    precision = sw.step
    order = int(np.rint(self.min_duration / precision))
    indices = scipy.signal.argrelmax(y, order=order)[0]

    mini = np.nanpercentile(y, 5)
    maxi = np.nanpercentile(y, 95)
    threshold = mini + self.alpha * (maxi - mini)

    peak_time = np.array([sw[i].middle for i in indices if y[i] > threshold])

    n_windows = len(y)
    start_time = sw[0].start
    end_time = sw[n_windows].end
    boundaries = np.hstack([[start_time], peak_time, [end_time]])

    segmentation = Timeline()
    for start, end in pairwise(boundaries):
        segment = Segment(start, end)
        segmentation.add(segment)

    return segmentation
def serial_speaker_to_Annotation(serial_speaker, uri=None, modality='speaker'):
    """
    Parameters:
    -----------
    serial_speaker : `dict`
        loaded from a serial speaker JSON as defined
        in https://figshare.com/articles/TV_Series_Corpus/3471839
    uri (uniform resource identifier) : `str`, optional
        which identifies the annotation (e.g. episode number)
        Default : None
    modality : `str`, optional
        modality of the annotation as defined in
        https://github.com/pyannote/pyannote-core

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization
        as defined in https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the serial_speaker file.
        Unknown speakers are not considered as annotated.
    """
    annotation = Annotation(uri, modality)
    not_annotated = Timeline(uri=uri)

    for segment in serial_speaker["data"]["speech_segments"]:
        time = Segment(segment["start"], segment["end"])
        speaker_id = segment['speaker'].replace(" ", "_")
        annotation[time, speaker_id] = speaker_id
        if speaker_id == 'unknown':
            not_annotated.add(time)

    end = serial_speaker.get("duration", segment["end"])
    annotated = not_annotated.gaps(support=Segment(0.0, end))
    return annotation, annotated
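# A hypothetical call for serial_speaker_to_Annotation; the dict below mimics
# the TV_Series_Corpus JSON structure referenced in the docstring, and the
# speaker names are made up.
from pyannote.core import Segment, Timeline

serial_speaker = {
    "duration": 30.0,
    "data": {"speech_segments": [
        {"start": 1.0, "end": 5.0, "speaker": "Leonard Hofstadter"},
        {"start": 6.0, "end": 9.0, "speaker": "unknown"},
    ]},
}
annotation, annotated = serial_speaker_to_Annotation(serial_speaker, uri='S01E01')

# the 'unknown' span [6, 9] is excluded from the annotated timeline
assert annotated == Timeline([Segment(0.0, 6.0), Segment(9.0, 30.0)])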
def test_consistent_timelines_with_empty_segments():
    # The first timeline is initialized with Segments, some empty.
    first_timeline = Timeline([Segment(1, 5),
                               Segment(6, 6),
                               Segment(7, 7),
                               Segment(8, 10)])

    # The second timeline adds one Segment at a time, including empty ones.
    second_timeline = Timeline()
    second_timeline.add(Segment(1, 5))
    second_timeline.add(Segment(6, 6))
    second_timeline.add(Segment(7, 7))
    second_timeline.add(Segment(8, 10))

    assert first_timeline == second_timeline
def apply(self, predictions, dimension=0):
    """
    Parameters
    ----------
    predictions : SlidingWindowFeature
        Must be mono-dimensional
    dimension : int, optional
        Which dimension to process
    """
    if len(predictions.data.shape) == 1:
        data = predictions.data
    elif predictions.data.shape[1] == 1:
        data = predictions.data[:, 0]
    else:
        data = predictions.data[:, dimension]

    n_samples = predictions.getNumber()
    window = predictions.sliding_window
    timestamps = [window[i].middle for i in range(n_samples)]

    # initial state
    start = timestamps[0]
    label = data[0] > self.onset

    # timeline meant to store 'active' segments
    active = Timeline()

    for t, y in zip(timestamps[1:], data[1:]):

        # currently active
        if label:
            # switching from active to inactive
            if y < self.offset:
                segment = Segment(start - self.pad_onset, t + self.pad_offset)
                active.add(segment)
                start = t
                label = False

        # currently inactive
        else:
            # switching from inactive to active
            if y > self.onset:
                start = t
                label = True

    # if active at the end, add final segment
    if label:
        segment = Segment(start - self.pad_onset, t + self.pad_offset)
        active.add(segment)

    # because of padding, some 'active' segments might be overlapping
    # therefore, we merge those overlapping segments
    active = active.coverage()

    # remove short 'active' segments
    active = Timeline(
        [s for s in active if s.duration > self.min_duration[1]])

    # fill short 'inactive' segments
    inactive = active.gaps()
    for s in inactive:
        if s.duration < self.min_duration[0]:
            active.add(s)
    active = active.coverage()

    return active
def apply(self, predictions, dimension=0):
    """
    Parameters
    ----------
    predictions : SlidingWindowFeature
        Must be mono-dimensional
    dimension : int, optional
        Which dimension to process
    """
    if len(predictions.data.shape) == 1:
        data = predictions.data
    elif predictions.data.shape[1] == 1:
        data = predictions.data[:, 0]
    else:
        data = predictions.data[:, dimension]

    if self.log_scale:
        data = np.exp(data)

    n_samples = predictions.getNumber()
    window = predictions.sliding_window
    timestamps = [window[i].middle for i in range(n_samples)]

    # initial state
    start = timestamps[0]
    label = data[0] > self.onset

    if self.scale == 'absolute':
        mini = 0
        maxi = 1

    elif self.scale == 'relative':
        mini = np.nanmin(data)
        maxi = np.nanmax(data)

    elif self.scale == 'percentile':
        mini = np.nanpercentile(data, 1)
        maxi = np.nanpercentile(data, 99)

    onset = mini + self.onset * (maxi - mini)
    offset = mini + self.offset * (maxi - mini)

    # timeline meant to store 'active' segments
    active = Timeline()

    for t, y in zip(timestamps[1:], data[1:]):

        # currently active
        if label:
            # switching from active to inactive
            if y < offset:
                segment = Segment(start - self.pad_onset, t + self.pad_offset)
                active.add(segment)
                start = t
                label = False

        # currently inactive
        else:
            # switching from inactive to active
            if y > onset:
                start = t
                label = True

    # if active at the end, add final segment
    if label:
        segment = Segment(start - self.pad_onset, t + self.pad_offset)
        active.add(segment)

    # because of padding, some 'active' segments might be overlapping
    # therefore, we merge those overlapping segments
    active = active.support()

    # remove short 'active' segments
    active = Timeline(
        [s for s in active if s.duration > self.min_duration_on])

    # fill short 'inactive' segments
    inactive = active.gaps()
    for s in inactive:
        if s.duration < self.min_duration_off:
            active.add(s)
    active = active.support()

    return active
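# A usage sketch for the binarizer above. The class name `Binarize` and its
# constructor arguments are assumptions matching the attributes the method
# reads (onset, offset, scale, log_scale, pad_onset, pad_offset,
# min_duration_on, min_duration_off); they are not defined in this snippet.
import numpy as np
from pyannote.core import SlidingWindow, SlidingWindowFeature

scores = SlidingWindowFeature(
    np.array([0.1, 0.8, 0.9, 0.8, 0.1, 0.1]),
    SlidingWindow(start=0.0, duration=0.1, step=0.1))

binarize = Binarize(onset=0.5, offset=0.5, scale='absolute', log_scale=False,
                    pad_onset=0.0, pad_offset=0.0,
                    min_duration_on=0.0, min_duration_off=0.0)

# yields a Timeline with a single 'active' region where scores exceed 0.5
speech = binarize.apply(scores)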
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):

    target_precision = self.precision

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration
    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    predictions = {}
    references = {}

    file_generator = getattr(protocol, subset)()
    for current_file in file_generator:
        uri = get_unique_identifier(current_file)

        # build overlap reference
        reference = Timeline(uri=uri)
        annotation = current_file['annotation']
        for track1, track2 in annotation.co_iter(annotation):
            if track1 == track2:
                continue
            reference.add(track1[0] & track2[0])
        references[uri] = reference.to_annotation()

        # extract overlap scores
        scores = sequence_labeling.apply(current_file)

        if model.logsoftmax:
            scores = SlidingWindowFeature(
                np.exp(scores.data[:, 2]), scores.sliding_window)
        else:
            scores = SlidingWindowFeature(
                scores.data[:, 2], scores.sliding_window)

        predictions[uri] = scores

    # dichotomic search to find the threshold that maximizes recall
    # while achieving at least `target_precision`
    lower_alpha = 0.
    upper_alpha = 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_recall = 0.

    for _ in range(10):

        current_alpha = .5 * (lower_alpha + upper_alpha)
        binarizer = Binarize(onset=current_alpha,
                             offset=current_alpha,
                             log_scale=False)

        precision = DetectionPrecision()
        recall = DetectionRecall()

        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            reference = references[uri]
            hypothesis = binarizer.apply(predictions[uri], dimension=0)
            hypothesis = hypothesis.to_annotation()
            uem = get_annotated(current_file)
            _ = precision(reference, hypothesis, uem=uem)
            _ = recall(reference, hypothesis, uem=uem)

        if abs(precision) < target_precision:
            # precision is not high enough: try higher thresholds
            lower_alpha = current_alpha

        else:
            upper_alpha = current_alpha
            r = abs(recall)
            if r > best_recall:
                best_recall = r
                best_alpha = current_alpha

    task = 'overlap_speech_detection'
    metric_name = f'{task}/recall@{target_precision:.2f}precision'
    return {
        metric_name: {'minimize': False, 'value': best_recall},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
def gecko_JSON_to_Annotation(gecko_JSON, uri=None, modality='speaker',
                             confidence_threshold=0.0, collar=0.0,
                             expected_min_speech_time=0.0, manual=False):
    """
    Parameters:
    -----------
    gecko_JSON : `dict`
        loaded from a Gecko-compliant JSON as defined in xml_to_GeckoJSON
    uri (uniform resource identifier) : `str`
        which identifies the annotation (e.g. episode number)
        Default : None
    modality : `str`
        modality of the annotation as defined in
        https://github.com/pyannote/pyannote-core
    confidence_threshold : `float`, Optional.
        The segments with confidence under confidence_threshold won't be
        added to the UEM file.
        Defaults to keep every segment (i.e. 0.0)
    collar : `float`, Optional.
        Merge tracks with same label and separated by less than `collar`
        seconds.
        Defaults to keep tracks timeline untouched (i.e. 0.0)
    expected_min_speech_time : `float`, Optional.
        Threshold (in seconds) under which the total duration of speech time
        is suspicious (warns the user).
        Defaults to never suspect anything (i.e. 0.0)
    manual : `bool`
        Whether the json is coming from a manual correction or straight from
        the forced-alignment output.
        In the former case, the regions timing is used. `confidence_threshold`
        and `collar` are thus irrelevant.
        In the latter case (default), the timing of each term is used.

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization
        as defined in https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the gecko_JSON files
        (depends on confidence_threshold)
    """
    annotation = Annotation(uri, modality)
    not_annotated = Timeline(uri=uri)
    for monologue in gecko_JSON["monologues"]:
        if not monologue:
            continue
        # '@' defined in https://github.com/hbredin/pyannote-db-plumcot/blob/develop/CONTRIBUTING.md#idepisodetxt
        # '+' defined in https://github.com/gong-io/gecko/blob/master/app/geckoModule/constants.js#L35
        speaker_ids = re.split(r"@|\+", monologue["speaker"]["id"])
        if manual:
            for speaker_id in speaker_ids:  # most of the time there's only one
                if speaker_id != '':  # happens with "all@"
                    annotation[Segment(monologue["start"], monologue["end"]),
                               speaker_id] = speaker_id
        else:
            for i, term in enumerate(monologue["terms"]):
                for speaker_id in speaker_ids:  # most of the time there's only one
                    if speaker_id != '':  # happens with "all@"
                        annotation[Segment(term["start"], term["end"]),
                                   speaker_id] = speaker_id
                if term["confidence"] <= confidence_threshold:
                    not_annotated.add(Segment(term["start"], term["end"]))

    if manual:
        annotated = Timeline([Segment(0.0, monologue["end"])], uri)
    else:
        annotation = annotation.support(collar)
        annotated = not_annotated.gaps(support=Segment(0.0, term["end"]))

    total_speech_time = annotation.crop(annotated).get_timeline().duration()
    if total_speech_time < expected_min_speech_time:
        warnings.warn(
            f"total speech time of {uri} is only {total_speech_time}")
    return annotation, annotated
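# A hypothetical driver for gecko_JSON_to_Annotation; the file path is made
# up, and the JSON is assumed to follow the Gecko structure described in the
# docstring above.
import json

with open('episode.json') as f:
    gecko_JSON = json.load(f)

annotation, annotated = gecko_JSON_to_Annotation(
    gecko_JSON, uri='S01E01', confidence_threshold=0.5, collar=0.1)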
def gecko_JSON_to_UEM(gecko_JSON, uri=None, modality='speaker',
                      confidence_threshold=0.0, collar=0.0,
                      expected_min_speech_time=0.0):
    """
    Parameters:
    -----------
    gecko_JSON : `dict`
        loaded from a Gecko-compliant JSON as defined in xml_to_GeckoJSON
    uri (uniform resource identifier) : `str`
        which identifies the annotation (e.g. episode number)
        Default : None
    modality : `str`
        modality of the annotation as defined in
        https://github.com/pyannote/pyannote-core
    confidence_threshold : `float`, Optional.
        The segments with confidence under confidence_threshold won't be
        added to the UEM file.
        Defaults to keep every segment (i.e. 0.0)
    collar : `float`, Optional.
        Merge tracks with same label and separated by less than `collar`
        seconds.
        Defaults to keep tracks timeline untouched (i.e. 0.0)
    expected_min_speech_time : `float`, Optional.
        Threshold (in seconds) under which the total duration of speech time
        is suspicious (warns the user).
        Defaults to never suspect anything (i.e. 0.0)

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization
        as defined in https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the gecko_JSON files
        (depends on confidence_threshold)
    """
    annotation = Annotation(uri, modality)
    annotated = Timeline(uri=uri)
    last_confident = 0.0
    last_unconfident = 0.0
    for monologue in gecko_JSON["monologues"]:
        if not monologue:
            continue
        # '@' defined in https://github.com/hbredin/pyannote-db-plumcot/blob/develop/CONTRIBUTING.md#idepisodetxt
        # '+' defined in https://github.com/gong-io/gecko/blob/master/app/geckoModule/constants.js#L35
        speaker_ids = re.split(r"@|\+", monologue["speaker"]["id"])
        for i, term in enumerate(monologue["terms"]):
            term["confidence"], term["start"], term["end"] = map(
                float,
                (term.get("confidence", 0.), term["start"], term["end"]))
            unknown = False
            for speaker_id in speaker_ids:  # most of the time there's only one
                if '#unknown#' in speaker_id:
                    unknown = True
                if speaker_id != '':  # happens with "all@"
                    annotation[Segment(term["start"], term["end"]),
                               speaker_id] = speaker_id
            if term["confidence"] <= confidence_threshold:
                last_unconfident = term["end"]
            else:
                if last_unconfident < last_confident and not unknown:
                    annotated.add(Segment(last_confident, term["end"]))
                last_confident = term["start"]

    annotation = annotation.support(collar)
    total_speech_time = annotation.crop(annotated).get_timeline().duration()
    if total_speech_time < expected_min_speech_time:
        warnings.warn(
            f"total speech time of {uri} is only {total_speech_time}")
    return annotation, annotated.support()
def __call__(self, reference, hypothesis):

    if isinstance(reference, Annotation):
        reference = reference.get_timeline()

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # over-segmentation
    over = Timeline(uri=reference.uri)
    prev_r = reference[0]
    intersection = []
    for r, h in reference.co_iter(hypothesis):

        if r != prev_r:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                over.add(segment)
            intersection = []
            prev_r = r

        segment = r & h
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        over.add(segment)

    # under-segmentation
    under = Timeline(uri=reference.uri)
    prev_h = hypothesis[0]
    intersection = []
    for h, r in hypothesis.co_iter(reference):

        if h != prev_h:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                under.add(segment)
            intersection = []
            prev_h = h

        segment = h & r
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        under.add(segment)

    # extent
    extent = reference.extent()

    # correct (neither under- nor over-segmented)
    correct = under.union(over).gaps(focus=extent)

    # frontier error (both under- and over-segmented)
    frontier = under.crop(over)

    # under-segmented
    not_over = over.gaps(focus=extent)
    only_under = under.crop(not_over)

    # over-segmented
    not_under = under.gaps(focus=extent)
    only_over = over.crop(not_under)

    status = Annotation(uri=reference.uri)
    for segment in correct:
        status[segment, '_'] = 'correct'
    for segment in frontier:
        status[segment, '_'] = 'frontier'
    for segment in only_over:
        status[segment, '_'] = 'over'
    for segment in only_under:
        status[segment, '_'] = 'under'

    return status.smooth()
def _sliding_samples(self):

    uris = list(self.data_)
    durations = np.array([self.data_[uri]["duration"] for uri in uris])
    probabilities = durations / np.sum(durations)

    sliding_segments = SlidingWindow(
        duration=self.duration, step=self.step * self.duration
    )

    while True:

        np.random.shuffle(uris)

        # loop on all files
        for uri in uris:

            datum = self.data_[uri]

            # make a copy of current file
            current_file = dict(datum["current_file"])

            # compute features for the whole file
            features = self.feature_extraction(current_file)

            # randomly shift 'annotated' segments start time so that
            # we avoid generating exactly the same subsequence twice
            annotated = Timeline()
            for segment in get_annotated(current_file):
                shifted_segment = Segment(
                    segment.start + np.random.random() * self.duration,
                    segment.end
                )
                if shifted_segment:
                    annotated.add(shifted_segment)

            samples = []
            for sequence in sliding_segments(annotated):

                X = features.crop(sequence, mode="center",
                                  fixed=self.duration)
                y = self.crop_y(datum["y"], sequence)
                sample = {"X": X, "y": y}

                if self.mask is not None:

                    # extract mask for current sub-segment
                    mask = current_file[self.mask].crop(
                        sequence, mode="center", fixed=self.duration
                    )

                    # it might happen that "mask" and "y" use different
                    # sliding windows. therefore, we simply resample "mask"
                    # to match "y"
                    if len(mask) != len(y):
                        mask = scipy.signal.resample(mask, len(y), axis=0)
                    sample["mask"] = mask

                for key, classes in self.file_labels_.items():
                    sample[key] = classes.index(current_file[key])

                samples.append(sample)

            np.random.shuffle(samples)
            for sample in samples:
                yield sample
def __call__(self, reference, hypothesis):

    if isinstance(reference, Annotation):
        reference = reference.get_timeline()

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # over-segmentation
    over = Timeline(uri=reference.uri)
    prev_r = reference[0]
    intersection = []
    for r, h in reference.co_iter(hypothesis):

        if r != prev_r:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                over.add(segment)
            intersection = []
            prev_r = r

        segment = r & h
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        over.add(segment)

    # under-segmentation
    under = Timeline(uri=reference.uri)
    prev_h = hypothesis[0]
    intersection = []
    for h, r in hypothesis.co_iter(reference):

        if h != prev_h:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                under.add(segment)
            intersection = []
            prev_h = h

        segment = h & r
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        under.add(segment)

    # extent
    extent = reference.extent()

    # correct (neither under- nor over-segmented)
    correct = under.union(over).gaps(support=extent)

    # frontier error (both under- and over-segmented)
    frontier = under.crop(over)

    # under-segmented
    not_over = over.gaps(support=extent)
    only_under = under.crop(not_over)

    # over-segmented
    not_under = under.gaps(support=extent)
    only_over = over.crop(not_under)

    status = Annotation(uri=reference.uri)
    # for segment in correct:
    #     status[segment, '_'] = 'correct'
    for segment in frontier:
        status[segment, '_'] = 'shift'
    for segment in only_over:
        status[segment, '_'] = 'over-segmentation'
    for segment in only_under:
        status[segment, '_'] = 'under-segmentation'

    return status.support()