def preprocess(self, current_file, identifier=None):
    """Pre-compute file-wise X and y"""

    # extract features for the whole file
    # (if it has not been done already)
    current_file = self.periodic_preprocess(
        current_file, identifier=identifier)

    # if labels have already been extracted, do nothing
    if identifier in self.preprocessed_.setdefault('y', {}):
        return current_file

    # get features as pyannote.core.SlidingWindowFeature instance
    X = self.preprocessed_['X'][identifier]
    sw = X.sliding_window
    n_samples = X.getNumber()

    # [-1] ==> unknown / [0] ==> not change part / [1] ==> change part
    y = np.zeros((n_samples + 4, 1), dtype=np.int8) - 1

    annotated = get_annotated(current_file)
    annotation = current_file['annotation']

    segments = []
    for segment, _ in annotation.itertracks():
        segments.append(
            Segment(segment.start - self.balance,
                    segment.start + self.balance))
        segments.append(
            Segment(segment.end - self.balance,
                    segment.end + self.balance))
    change_part = Timeline(segments).support().crop(
        annotated, mode='intersection')

    # iterate over non-change regions
    for non_changes in change_part.gaps(annotated):
        indices = sw.crop(non_changes, mode='loose')
        y[indices, 0] = 0

    # iterate over change regions
    for changes in change_part:
        indices = sw.crop(changes, mode='loose')
        y[indices, 0] = 1

    y = SlidingWindowFeature(y[:-1], sw)
    self.preprocessed_['y'][identifier] = y

    return current_file

def main():
    usage = "%prog [options] VADrttm OVERLAPrttm OUTPUTrttm"
    desc = ("Convert the txt file from diarization of the form "
            "'ID t_in t_out' into a Kaldi-format file for the spkdet task")
    version = "%prog 0.1"
    parser = OptionParser(usage=usage, description=desc, version=version)
    (opt, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Incorrect number of arguments")
    vadrttm, overlaprttm, outputrttm = args

    # Read the documents and load them in memory
    vad = pyannote.database.util.load_rttm(vadrttm)
    ovl = pyannote.database.util.load_rttm(overlaprttm)

    fw = open(outputrttm, 'wt')
    for name in vad:
        # Examples
        # speech = vad['EN2002a.Mix-Headset-0000000-0006000'].get_timeline()
        # duration = vad['EN2002a.Mix-Headset-0000000-0006000'].get_timeline()[-1][1]
        # overlap = ovl['EN2002a.Mix-Headset-0000000-0006000'].get_timeline()
        speech = vad[name].get_timeline()
        duration = vad[name].get_timeline()[-1][1]
        if name in ovl.keys():
            overlap = ovl[name].get_timeline()
            # keep only the intersections of the VAD and overlap regions
            intersection = Timeline()
            for speech_segment, overlap_segment in speech.co_iter(overlap):
                intersection.add(speech_segment & overlap_segment)
            keep = intersection.gaps(support=Segment(0, duration))
            vad_without_overlap = speech.crop(keep)
        else:
            vad_without_overlap = speech

        # Write RTTM
        write_rttm(fw, vad_without_overlap, label='speech')
    fw.close()

def run(self):

    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / rate
    extent = Segment(0., duration)

    with self.in_speaker().open('r') as fp:
        speaker = pyannote.core.json.load(fp)

    timeline = Timeline()
    for segment, _ in speaker.itertracks():
        timeline.add(segment)

    # fill gaps
    for gap in timeline.gaps(extent):
        if gap.duration < self.fill_gaps:
            timeline.add(gap)
    timeline = timeline.coverage()

    # dump as annotation...
    if self.to_annotation:
        annotation = Annotation()
        for s, segment in enumerate(timeline):
            annotation[segment] = s
        annotation = annotation.anonymize_labels(generator='string')
        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)

    # ... or as timeline
    else:
        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(timeline, fp)

def remove_excluded(self):
    if len(self.excluded) == 0:
        return

    from pyannote.core import Segment, Timeline

    segments = []
    for recording, _segments in self.segments.groupby("recording_filename"):
        sampled = Timeline(segments=[
            Segment(segment_onset, segment_offset)
            for segment_onset, segment_offset in _segments[
                ["segment_onset", "segment_offset"]].values
        ])

        excl_segments = self.excluded.loc[
            self.excluded["recording_filename"] == recording]
        excl = Timeline(segments=[
            Segment(segment_onset, segment_offset)
            for segment_onset, segment_offset in excl_segments[
                ["segment_onset", "segment_offset"]].values
        ])

        # sampled = sampled.extrude(excl)  # not released yet
        extent_tl = Timeline([sampled.extent()], uri=sampled.uri)
        truncating_support = excl.gaps(support=extent_tl)
        sampled = sampled.crop(truncating_support, mode="intersection")

        segments.append(
            pd.DataFrame(
                [[recording, s.start, s.end] for s in sampled],
                columns=[
                    "recording_filename", "segment_onset", "segment_offset"
                ],
            ))

    self.segments = pd.concat(segments)

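# Hypothetical illustration (not part of the class above): the gaps-plus-crop
# workaround used in remove_excluded is intended to behave like
# Timeline.extrude, i.e. remove the excluded regions from the sampled regions.
from pyannote.core import Segment, Timeline

sampled = Timeline([Segment(0, 10), Segment(20, 30)])
excl = Timeline([Segment(5, 6), Segment(22, 25)])
extent_tl = Timeline([sampled.extent()], uri=sampled.uri)
kept = sampled.crop(excl.gaps(support=extent_tl), mode="intersection")
# kept covers [0, 5], [6, 10], [20, 22] and [25, 30]
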
def serial_speaker_to_Annotation(serial_speaker, uri=None, modality='speaker'):
    """
    Parameters:
    -----------
    serial_speaker : `dict`
        loaded from a serial speaker JSON as defined
        in https://figshare.com/articles/TV_Series_Corpus/3471839
    uri (uniform resource identifier) : `str`, optional
        which identifies the annotation (e.g. episode number)
        Default : None
    modality : `str`, optional
        modality of the annotation as defined in
        https://github.com/pyannote/pyannote-core

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization
        as defined in https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the serial_speaker file.
        Unknown speakers are not considered as annotated.
    """
    annotation = Annotation(uri, modality)
    not_annotated = Timeline(uri=uri)

    for segment in serial_speaker["data"]["speech_segments"]:
        time = Segment(segment["start"], segment["end"])
        speaker_id = segment['speaker'].replace(" ", "_")
        annotation[time, speaker_id] = speaker_id
        if speaker_id == 'unknown':
            not_annotated.add(time)

    end = serial_speaker.get("duration", segment["end"])
    annotated = not_annotated.gaps(support=Segment(0.0, end))
    return annotation, annotated

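# Hypothetical usage sketch (not from the original code base), assuming
# pyannote.core (Annotation, Timeline, Segment) is imported alongside
# serial_speaker_to_Annotation, and a minimal serial-speaker dict:
example_serial_speaker = {
    "duration": 10.0,
    "data": {
        "speech_segments": [
            {"start": 0.5, "end": 2.0, "speaker": "Leonard Hofstadter"},
            {"start": 2.5, "end": 4.0, "speaker": "unknown"},
        ]
    },
}
annotation, annotated = serial_speaker_to_Annotation(
    example_serial_speaker, uri="demo")
# the "unknown" segment is excluded from the `annotated` timeline
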
def apply(self, predictions, dimension=0):
    """
    Parameters
    ----------
    predictions : SlidingWindowFeature
        Must be mono-dimensional
    dimension : int, optional
        Which dimension to process
    """

    if len(predictions.data.shape) == 1:
        data = predictions.data
    elif predictions.data.shape[1] == 1:
        data = predictions.data[:, 0]
    else:
        data = predictions.data[:, dimension]

    if self.log_scale:
        data = np.exp(data)

    n_samples = predictions.getNumber()
    window = predictions.sliding_window
    timestamps = [window[i].middle for i in range(n_samples)]

    # initial state
    start = timestamps[0]
    label = data[0] > self.onset

    if self.scale == 'absolute':
        mini = 0
        maxi = 1
    elif self.scale == 'relative':
        mini = np.nanmin(data)
        maxi = np.nanmax(data)
    elif self.scale == 'percentile':
        mini = np.nanpercentile(data, 1)
        maxi = np.nanpercentile(data, 99)

    onset = mini + self.onset * (maxi - mini)
    offset = mini + self.offset * (maxi - mini)

    # timeline meant to store 'active' segments
    active = Timeline()

    for t, y in zip(timestamps[1:], data[1:]):

        # currently active
        if label:
            # switching from active to inactive
            if y < offset:
                segment = Segment(start - self.pad_onset,
                                  t + self.pad_offset)
                active.add(segment)
                start = t
                label = False

        # currently inactive
        else:
            # switching from inactive to active
            if y > onset:
                start = t
                label = True

    # if active at the end, add final segment
    if label:
        segment = Segment(start - self.pad_onset, t + self.pad_offset)
        active.add(segment)

    # because of padding, some 'active' segments might be overlapping
    # therefore, we merge those overlapping segments
    active = active.support()

    # remove short 'active' segments
    active = Timeline(
        [s for s in active if s.duration > self.min_duration_on])

    # fill short 'inactive' segments
    inactive = active.gaps()
    for s in inactive:
        if s.duration < self.min_duration_off:
            active.add(s)
    active = active.support()

    return active

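# Hypothetical input sketch (not from the original code base): `predictions`
# is a mono-dimensional pyannote.core.SlidingWindowFeature of per-frame
# scores; something like the following could be fed to apply(), assuming a
# binarizer object exposing onset, offset, scale, log_scale, pad_onset,
# pad_offset, min_duration_on and min_duration_off.
import numpy as np
from pyannote.core import SlidingWindow, SlidingWindowFeature

scores = np.random.rand(1000, 1)  # 1000 frames, one activation score each
frames = SlidingWindow(start=0.0, duration=0.032, step=0.010)
predictions = SlidingWindowFeature(scores, frames)
# speech = binarizer.apply(predictions, dimension=0)  # -> pyannote Timeline
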
def gecko_JSON_to_Annotation(gecko_JSON, uri=None, modality='speaker',
                             confidence_threshold=0.0, collar=0.0,
                             expected_min_speech_time=0.0, manual=False):
    """
    Parameters:
    -----------
    gecko_JSON : `dict`
        loaded from a Gecko-compliant JSON as defined in xml_to_GeckoJSON
    uri (uniform resource identifier) : `str`
        which identifies the annotation (e.g. episode number)
        Default : None
    modality : `str`
        modality of the annotation as defined in
        https://github.com/pyannote/pyannote-core
    confidence_threshold : `float`, Optional.
        Segments with confidence under confidence_threshold won't be added
        to the UEM file.
        Defaults to keeping every segment (i.e. 0.0)
    collar : `float`, Optional.
        Merge tracks with the same label separated by less than `collar`
        seconds.
        Defaults to keeping the tracks timeline untouched (i.e. 0.0)
    expected_min_speech_time : `float`, Optional.
        Threshold (in seconds) under which the total duration of speech time
        is suspicious (warns the user).
        Defaults to never suspecting anything (i.e. 0.0)
    manual : `bool`
        Whether the JSON comes from a manual correction or straight from
        the forced-alignment output.
        In the former case, the regions timing is used; `confidence_threshold`
        and `collar` are thus irrelevant.
        In the latter case (default), the timing of each term is used.

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization
        as defined in https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the gecko_JSON files
        (depends on confidence_threshold)
    """
    annotation = Annotation(uri, modality)
    not_annotated = Timeline(uri=uri)
    for monologue in gecko_JSON["monologues"]:
        if not monologue:
            continue
        # '@' defined in https://github.com/hbredin/pyannote-db-plumcot/blob/develop/CONTRIBUTING.md#idepisodetxt
        # '+' defined in https://github.com/gong-io/gecko/blob/master/app/geckoModule/constants.js#L35
        speaker_ids = re.split(r"@|\+", monologue["speaker"]["id"])
        if manual:
            for speaker_id in speaker_ids:  # most of the time there's only one
                if speaker_id != '':  # happens with "all@"
                    annotation[Segment(monologue["start"],
                                       monologue["end"]),
                               speaker_id] = speaker_id
        else:
            for i, term in enumerate(monologue["terms"]):
                for speaker_id in speaker_ids:  # most of the time there's only one
                    if speaker_id != '':  # happens with "all@"
                        annotation[Segment(term["start"], term["end"]),
                                   speaker_id] = speaker_id
                if term["confidence"] <= confidence_threshold:
                    not_annotated.add(Segment(term["start"], term["end"]))

    if manual:
        annotated = Timeline([Segment(0.0, monologue["end"])], uri)
    else:
        annotation = annotation.support(collar)
        annotated = not_annotated.gaps(support=Segment(0.0, term["end"]))

    total_speech_time = annotation.crop(annotated).get_timeline().duration()
    if total_speech_time < expected_min_speech_time:
        warnings.warn(
            f"total speech time of {uri} is only {total_speech_time}")
    return annotation, annotated

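# Hypothetical usage sketch (not from the original code base): a minimal
# Gecko-style dict with the "monologues"/"terms" layout the function expects.
# It assumes re, warnings and pyannote.core (Annotation, Timeline, Segment)
# are imported alongside gecko_JSON_to_Annotation.
example_gecko_JSON = {
    "monologues": [
        {
            "speaker": {"id": "alice@bob"},  # split on '@'/'+' into ids
            "start": 0.0,
            "end": 3.0,
            "terms": [
                {"start": 0.0, "end": 1.5, "confidence": 0.9},
                {"start": 1.5, "end": 3.0, "confidence": 0.4},
            ],
        }
    ]
}
annotation, annotated = gecko_JSON_to_Annotation(
    example_gecko_JSON, uri="demo", confidence_threshold=0.5)
# the low-confidence term (0.4 <= 0.5) is excluded from `annotated`
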
def __call__(self, reference, hypothesis):

    if isinstance(reference, Annotation):
        reference = reference.get_timeline()

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # over-segmentation
    over = Timeline(uri=reference.uri)
    prev_r = reference[0]
    intersection = []
    for r, h in reference.co_iter(hypothesis):
        if r != prev_r:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                over.add(segment)
            intersection = []
            prev_r = r

        segment = r & h
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        over.add(segment)

    # under-segmentation
    under = Timeline(uri=reference.uri)
    prev_h = hypothesis[0]
    intersection = []
    for h, r in hypothesis.co_iter(reference):
        if h != prev_h:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                under.add(segment)
            intersection = []
            prev_h = h

        segment = h & r
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        under.add(segment)

    # extent
    extent = reference.extent()

    # correct (neither under- nor over-segmented)
    correct = under.union(over).gaps(support=extent)

    # frontier error (both under- and over-segmented)
    frontier = under.crop(over)

    # under-segmented
    not_over = over.gaps(support=extent)
    only_under = under.crop(not_over)

    # over-segmented
    not_under = under.gaps(support=extent)
    only_over = over.crop(not_under)

    status = Annotation(uri=reference.uri)
    # for segment in correct:
    #     status[segment, '_'] = 'correct'
    for segment in frontier:
        status[segment, '_'] = 'shift'
    for segment in only_over:
        status[segment, '_'] = 'over-segmentation'
    for segment in only_under:
        status[segment, '_'] = 'under-segmentation'

    return status.support()

def apply(self, predictions, dimension=0):
    """
    Parameters
    ----------
    predictions : SlidingWindowFeature
        Must be mono-dimensional
    dimension : int, optional
        Which dimension to process
    """

    if len(predictions.data.shape) == 1:
        data = predictions.data
    elif predictions.data.shape[1] == 1:
        data = predictions.data[:, 0]
    else:
        data = predictions.data[:, dimension]

    n_samples = predictions.getNumber()
    window = predictions.sliding_window
    timestamps = [window[i].middle for i in range(n_samples)]

    # initial state
    start = timestamps[0]
    label = data[0] > self.onset

    # timeline meant to store 'active' segments
    active = Timeline()

    for t, y in zip(timestamps[1:], data[1:]):

        # currently active
        if label:
            # switching from active to inactive
            if y < self.offset:
                segment = Segment(start - self.pad_onset,
                                  t + self.pad_offset)
                active.add(segment)
                start = t
                label = False

        # currently inactive
        else:
            # switching from inactive to active
            if y > self.onset:
                start = t
                label = True

    # if active at the end, add final segment
    if label:
        segment = Segment(start - self.pad_onset, t + self.pad_offset)
        active.add(segment)

    # because of padding, some 'active' segments might be overlapping
    # therefore, we merge those overlapping segments
    active = active.coverage()

    # remove short 'active' segments
    active = Timeline(
        [s for s in active if s.duration > self.min_duration[1]])

    # fill short 'inactive' segments
    inactive = active.gaps()
    for s in inactive:
        if s.duration < self.min_duration[0]:
            active.add(s)
    active = active.coverage()

    return active

def __call__(self, reference, hypothesis):

    if isinstance(reference, Annotation):
        reference = reference.get_timeline()

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # over-segmentation
    over = Timeline(uri=reference.uri)
    prev_r = reference[0]
    intersection = []
    for r, h in reference.co_iter(hypothesis):
        if r != prev_r:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                over.add(segment)
            intersection = []
            prev_r = r

        segment = r & h
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        over.add(segment)

    # under-segmentation
    under = Timeline(uri=reference.uri)
    prev_h = hypothesis[0]
    intersection = []
    for h, r in hypothesis.co_iter(reference):
        if h != prev_h:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                under.add(segment)
            intersection = []
            prev_h = h

        segment = h & r
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        under.add(segment)

    # extent
    extent = reference.extent()

    # correct (neither under- nor over-segmented)
    correct = under.union(over).gaps(focus=extent)

    # frontier error (both under- and over-segmented)
    frontier = under.crop(over)

    # under-segmented
    not_over = over.gaps(focus=extent)
    only_under = under.crop(not_over)

    # over-segmented
    not_under = under.gaps(focus=extent)
    only_over = over.crop(not_under)

    status = Annotation(uri=reference.uri)
    for segment in correct:
        status[segment, '_'] = 'correct'
    for segment in frontier:
        status[segment, '_'] = 'frontier'
    for segment in only_over:
        status[segment, '_'] = 'over'
    for segment in only_under:
        status[segment, '_'] = 'under'

    return status.smooth()

def test_empty_gaps():
    empty_timeline = Timeline(uri='MyEmptyGaps')
    assert list(empty_timeline.gaps()) == []
    Segment.set_precision(3)
    assert list(empty_timeline.gaps()) == []
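
# Hedged companion check (not part of the original test suite): with an
# explicit support, gaps() on an empty timeline should return the whole
# support, and the complement once a segment is added. This assumes the
# gaps(support=...) signature used elsewhere in this file.
def test_gaps_with_support():
    timeline = Timeline(uri='MyGaps')
    support = Segment(0, 10)
    assert list(timeline.gaps(support=support)) == [Segment(0, 10)]
    timeline.add(Segment(2, 3))
    assert list(timeline.gaps(support=support)) == [Segment(0, 2),
                                                    Segment(3, 10)]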