import re
import warnings

from pyannote.core import Annotation, Segment, Timeline


def gecko_JSON_to_Annotation(gecko_JSON, uri=None, modality='speaker',
                             confidence_threshold=0.0, collar=0.0,
                             expected_min_speech_time=0.0, manual=False):
    """
    Parameters:
    -----------
    gecko_JSON : `dict`
        loaded from a Gecko-compliant JSON as defined in xml_to_GeckoJSON
    uri (uniform resource identifier) : `str`
        identifies the annotation (e.g. episode number). Defaults to None.
    modality : `str`
        modality of the annotation as defined in
        https://github.com/pyannote/pyannote-core
    confidence_threshold : `float`, optional
        Segments with confidence under `confidence_threshold` won't be added
        to the UEM file. Defaults to keeping every segment (i.e. 0.0).
    collar : `float`, optional
        Merge tracks with the same label separated by less than `collar`
        seconds. Defaults to keeping the track timeline untouched (i.e. 0.0).
    expected_min_speech_time : `float`, optional
        Threshold (in seconds) under which the total speech duration is
        suspicious (warns the user). Defaults to never suspecting anything
        (i.e. 0.0).
    manual : `bool`
        Whether the JSON comes from a manual correction or straight from the
        forced-alignment output.
        In the former case, the region timings are used; `confidence_threshold`
        and `collar` are thus irrelevant.
        In the latter case (default), the timing of each term is used.

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization, as defined in
        https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the gecko_JSON file
        (depends on confidence_threshold)
    """
    annotation = Annotation(uri, modality)
    not_annotated = Timeline(uri=uri)

    for monologue in gecko_JSON["monologues"]:
        if not monologue:
            continue

        # '@' defined in https://github.com/hbredin/pyannote-db-plumcot/blob/develop/CONTRIBUTING.md#idepisodetxt
        # '+' defined in https://github.com/gong-io/gecko/blob/master/app/geckoModule/constants.js#L35
        speaker_ids = re.split(r"@|\+", monologue["speaker"]["id"])

        if manual:
            for speaker_id in speaker_ids:  # most of the time there's only one
                if speaker_id != '':  # happens with "all@"
                    annotation[Segment(monologue["start"], monologue["end"]),
                               speaker_id] = speaker_id
        else:
            for i, term in enumerate(monologue["terms"]):
                for speaker_id in speaker_ids:  # most of the time there's only one
                    if speaker_id != '':  # happens with "all@"
                        annotation[Segment(term["start"], term["end"]),
                                   speaker_id] = speaker_id
                if term["confidence"] <= confidence_threshold:
                    not_annotated.add(Segment(term["start"], term["end"]))

    # note: `monologue` (and `term`) keep their values from the last iteration
    if manual:
        annotated = Timeline([Segment(0.0, monologue["end"])], uri)
    else:
        annotation = annotation.support(collar)
        annotated = not_annotated.gaps(support=Segment(0.0, term["end"]))

    total_speech_time = annotation.crop(annotated).get_timeline().duration()
    if total_speech_time < expected_min_speech_time:
        warnings.warn(f"total speech time of {uri} is only {total_speech_time}")
    return annotation, annotated
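
# Usage sketch for gecko_JSON_to_Annotation: a minimal example assuming a
# hypothetical Gecko export "episode_01.json"; the URI and threshold values
# below are illustrative, not from the source.
def _example_gecko_json_to_annotation():
    import json
    with open("episode_01.json") as f:  # hypothetical path
        gecko_json = json.load(f)
    annotation, annotated = gecko_JSON_to_Annotation(
        gecko_json, uri="episode_01", modality="speaker",
        confidence_threshold=0.5, collar=0.1,
        expected_min_speech_time=60.0)
    return annotation, annotated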
def regression(self, reference, before, after, uem=None, uemified=False):
    # `REGRESSION`, `IMPROVEMENT`, `BOTH_CORRECT`, `BOTH_INCORRECT` and
    # `MATCH_CORRECT` are module-level constants of the enclosing error
    # analysis module; `self.munkres` solves the assignment problem between
    # 'before' and 'after' errors on each segment.
    _, before, errors_before = self.difference(
        reference, before, uem=uem, uemified=True)
    reference, after, errors_after = self.difference(
        reference, after, uem=uem, uemified=True)

    behaviors = Annotation(uri=reference.uri, modality=reference.modality)

    # common (up-sampled) timeline
    common_timeline = errors_after.get_timeline().union(
        errors_before.get_timeline())
    common_timeline = common_timeline.segmentation()

    # align 'before' errors on common timeline
    B = self._tagger(errors_before, common_timeline)

    # align 'after' errors on common timeline
    A = self._tagger(errors_after, common_timeline)

    for segment in common_timeline:

        old_errors = B.get_labels(segment, unique=False)
        new_errors = A.get_labels(segment, unique=False)

        n1 = len(old_errors)
        n2 = len(new_errors)
        n = max(n1, n2)

        match = np.zeros((n, n), dtype=int)
        for i1, e1 in enumerate(old_errors):
            for i2, e2 in enumerate(new_errors):
                match[i1, i2] = self._match_errors(e1, e2)

        mapping = self.munkres.compute(2 - match)

        for i1, i2 in mapping:

            if i1 >= n1:
                track = behaviors.new_track(segment,
                                            candidate=REGRESSION,
                                            prefix=REGRESSION)
                behaviors[segment, track] = (REGRESSION, None, new_errors[i2])

            elif i2 >= n2:
                track = behaviors.new_track(segment,
                                            candidate=IMPROVEMENT,
                                            prefix=IMPROVEMENT)
                behaviors[segment, track] = (IMPROVEMENT, old_errors[i1], None)

            elif old_errors[i1][0] == MATCH_CORRECT:

                if new_errors[i2][0] == MATCH_CORRECT:
                    track = behaviors.new_track(segment,
                                                candidate=BOTH_CORRECT,
                                                prefix=BOTH_CORRECT)
                    behaviors[segment, track] = (BOTH_CORRECT,
                                                 old_errors[i1],
                                                 new_errors[i2])
                else:
                    track = behaviors.new_track(segment,
                                                candidate=REGRESSION,
                                                prefix=REGRESSION)
                    behaviors[segment, track] = (REGRESSION,
                                                 old_errors[i1],
                                                 new_errors[i2])

            else:

                if new_errors[i2][0] == MATCH_CORRECT:
                    track = behaviors.new_track(segment,
                                                candidate=IMPROVEMENT,
                                                prefix=IMPROVEMENT)
                    behaviors[segment, track] = (IMPROVEMENT,
                                                 old_errors[i1],
                                                 new_errors[i2])
                else:
                    track = behaviors.new_track(segment,
                                                candidate=BOTH_INCORRECT,
                                                prefix=BOTH_INCORRECT)
                    behaviors[segment, track] = (BOTH_INCORRECT,
                                                 old_errors[i1],
                                                 new_errors[i2])

    behaviors = behaviors.support()

    if uemified:
        return reference, before, after, behaviors

    return behaviors
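
# Usage sketch for regression(): a hedged example assuming this method lives
# on pyannote.metrics' IdentificationErrorAnalysis (which provides the
# `difference`, `_tagger` and `_match_errors` helpers and the `munkres`
# solver used above). `reference`, `before` and `after` are pyannote
# Annotations of the same file before/after a change to the system.
def _example_regression(reference, before, after):
    from pyannote.metrics.errors.identification import \
        IdentificationErrorAnalysis
    analysis = IdentificationErrorAnalysis()
    behaviors = analysis.regression(reference, before, after)
    # each (segment, track) is labeled with a
    # (behavior, before_error, after_error) tuple
    return behaviors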
def gecko_JSON_to_UEM(gecko_JSON, uri=None, modality='speaker',
                      confidence_threshold=0.0, collar=0.0,
                      expected_min_speech_time=0.0):
    """
    Parameters:
    -----------
    gecko_JSON : `dict`
        loaded from a Gecko-compliant JSON as defined in xml_to_GeckoJSON
    uri (uniform resource identifier) : `str`
        identifies the annotation (e.g. episode number). Defaults to None.
    modality : `str`
        modality of the annotation as defined in
        https://github.com/pyannote/pyannote-core
    confidence_threshold : `float`, optional
        Segments with confidence under `confidence_threshold` won't be added
        to the UEM file. Defaults to keeping every segment (i.e. 0.0).
    collar : `float`, optional
        Merge tracks with the same label separated by less than `collar`
        seconds. Defaults to keeping the track timeline untouched (i.e. 0.0).
    expected_min_speech_time : `float`, optional
        Threshold (in seconds) under which the total speech duration is
        suspicious (warns the user). Defaults to never suspecting anything
        (i.e. 0.0).

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization, as defined in
        https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the gecko_JSON file
        (depends on confidence_threshold)
    """
    annotation = Annotation(uri, modality)
    annotated = Timeline(uri=uri)
    last_confident = 0.0
    last_unconfident = 0.0

    for monologue in gecko_JSON["monologues"]:
        if not monologue:
            continue

        # '@' defined in https://github.com/hbredin/pyannote-db-plumcot/blob/develop/CONTRIBUTING.md#idepisodetxt
        # '+' defined in https://github.com/gong-io/gecko/blob/master/app/geckoModule/constants.js#L35
        speaker_ids = re.split(r"@|\+", monologue["speaker"]["id"])

        for i, term in enumerate(monologue["terms"]):
            term["confidence"], term["start"], term["end"] = map(
                float, (term.get("confidence", 0.), term["start"], term["end"]))
            unknown = False
            for speaker_id in speaker_ids:  # most of the time there's only one
                if '#unknown#' in speaker_id:
                    unknown = True
                if speaker_id != '':  # happens with "all@"
                    annotation[Segment(term["start"], term["end"]),
                               speaker_id] = speaker_id
            if term["confidence"] <= confidence_threshold:
                last_unconfident = term["end"]
            else:
                if last_unconfident < last_confident and not unknown:
                    annotated.add(Segment(last_confident, term["end"]))
                last_confident = term["start"]

    annotation = annotation.support(collar)
    total_speech_time = annotation.crop(annotated).get_timeline().duration()
    if total_speech_time < expected_min_speech_time:
        warnings.warn(f"total speech time of {uri} is only {total_speech_time}")
    return annotation, annotated.support()
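
# Usage sketch for gecko_JSON_to_UEM: paths are hypothetical. Writes the
# annotation as RTTM and the annotated timeline as UEM, assuming
# pyannote.core's Annotation.write_rttm and Timeline.write_uem are available.
def _example_gecko_json_to_uem():
    import json
    with open("episode_01.json") as f:  # hypothetical path
        gecko_json = json.load(f)
    annotation, annotated = gecko_JSON_to_UEM(
        gecko_json, uri="episode_01", confidence_threshold=0.5)
    with open("episode_01.rttm", "w") as rttm:
        annotation.write_rttm(rttm)
    with open("episode_01.uem", "w") as uem:
        annotated.write_uem(uem)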
# Script fragment: the call opened before this snippet was truncated in the
# source; the visible keyword arguments match spectralcluster's auto-tuning
# options. Assumes `SpectralClusterer` (spectralcluster package),
# pyannote.core's Annotation/Segment, and `rttm_dir`, `sample_ids`,
# `sequences`, `intervals` defined earlier in the original script.
    p_percentile_max=0.95,
    init_search_step=0.01,
    search_level=3)

icassp2018_clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=18,
    autotune=None,
    laplacian_type=None,
    refinement_options=icassp2018_refinement_options,
    custom_dist="cosine")

for idx, sample_id in enumerate(sample_ids):
    labels = icassp2018_clusterer.predict(sequences[idx])
    print('Predicted labels: ', sample_id, f' {idx+1}/{len(sample_ids)}')

    annotation = Annotation()
    annotation.uri = sample_id
    for jdx, speaker_id in enumerate(labels):
        segment_interval = intervals[idx][jdx]
        annotation[Segment(segment_interval[0], segment_interval[1])] = speaker_id

    rttm_file = '{}/{}.rttm'.format(rttm_dir, sample_id)
    with open(rttm_file, 'w') as file:
        annotation.support().write_rttm(file)

    # rttm_file_collar = '{}/rttm_colar/{}.rttm'.format(rttm_dir, sample_id)
    # with open(rttm_file_collar, 'w') as file:
    #     annotation.support(0.481).write_rttm(file)
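
# Input-shape sketch (an assumption, not from the source): `sequences[idx]`
# is a (num_windows, dim) array of speaker embeddings for one recording and
# `intervals[idx]` is a matching (num_windows, 2) array of (start, end)
# times in seconds, so each predicted label lines up with one segment.
def _example_clusterer_inputs():
    import numpy as np
    sample_ids = ["file_000"]
    sequences = [np.random.rand(100, 256)]  # 100 windows, 256-dim embeddings
    starts = np.arange(100) * 0.5
    intervals = [np.stack([starts, starts + 1.0], axis=1)]
    return sample_ids, sequences, intervals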
def generate_annotation(self, uri, labels):
    # one 1-second segment per frame label; support() then merges
    # consecutive segments sharing the same label
    res = Annotation(uri)
    for i, label in enumerate(labels):
        res[Segment(i, i + 1)] = str(label)
    return res.support()
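
# Behavior sketch for generate_annotation: with 1-second frames, consecutive
# identical labels collapse into one segment via support(). `clf` stands in
# for whatever object defines the method (hypothetical name).
def _example_generate_annotation(clf):
    ann = clf.generate_annotation("file_000", [0, 0, 1, 1, 1])
    # -> [0, 2) labeled "0" and [2, 5) labeled "1"
    return ann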
def __call__(self, reference, hypothesis):

    if isinstance(reference, Annotation):
        reference = reference.get_timeline()

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # over-segmentation
    over = Timeline(uri=reference.uri)
    prev_r = reference[0]
    intersection = []
    for r, h in reference.co_iter(hypothesis):
        if r != prev_r:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                over.add(segment)
            intersection = []
            prev_r = r
        segment = r & h
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        over.add(segment)

    # under-segmentation
    under = Timeline(uri=reference.uri)
    prev_h = hypothesis[0]
    intersection = []
    for h, r in hypothesis.co_iter(reference):
        if h != prev_h:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                under.add(segment)
            intersection = []
            prev_h = h
        segment = h & r
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        under.add(segment)

    # extent
    extent = reference.extent()

    # correct (neither under- nor over-segmented)
    correct = under.union(over).gaps(support=extent)

    # frontier error (both under- and over-segmented)
    frontier = under.crop(over)

    # under-segmented
    not_over = over.gaps(support=extent)
    only_under = under.crop(not_over)

    # over-segmented
    not_under = under.gaps(support=extent)
    only_over = over.crop(not_under)

    status = Annotation(uri=reference.uri)
    # for segment in correct:
    #     status[segment, '_'] = 'correct'
    for segment in frontier:
        status[segment, '_'] = 'shift'
    for segment in only_over:
        status[segment, '_'] = 'over-segmentation'
    for segment in only_under:
        status[segment, '_'] = 'under-segmentation'

    return status.support()
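
# Usage sketch for the segmentation analysis above. `SegmentationStatus` is
# a hypothetical name for the class defining __call__; the toy timelines are
# illustrative only.
def _example_segmentation_status():
    reference = Timeline([Segment(0, 10), Segment(10, 20)])
    hypothesis = Timeline([Segment(0, 6), Segment(6, 20)])
    status = SegmentationStatus()(reference, hypothesis)
    # segments are labeled 'shift', 'over-segmentation' or
    # 'under-segmentation'; correctly segmented regions are left out
    return status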
# assumed module-level context: numpy as np; scipy.spatial.distance.cdist;
# scipy.cluster.hierarchy.fcluster; pyannote.core (Annotation, Segment,
# Timeline, SlidingWindow, SlidingWindowFeature) and its constrained
# clustering helper pool (pyannote.core.utils.hierarchy); plus
# get_audio_duration, time2index and index2index helpers.
def __call__(
    self,
    current_file: ProtocolFile,
    cannot_link: List[Tuple[float, float]] = None,
    must_link: List[Tuple[float, float]] = None,
) -> Annotation:
    """Apply speaker diarization

    Parameters
    ----------
    current_file : ProtocolFile
        Protocol file.
    cannot_link : list of (float, float), optional
        Time-based "cannot link" constraints.
    must_link : list of (float, float), optional
        Time-based "must link" constraints.

    Returns
    -------
    diarization : Annotation
        Speaker diarization result.
    """

    if cannot_link is None:
        cannot_link = []
    if must_link is None:
        must_link = []

    if "duration" not in current_file:
        current_file["duration"] = get_audio_duration(current_file)

    # in "interactive annotation" mode, there is no need to recompute speech
    # regions every time a file is processed: they can be passed with the
    # file directly
    if "speech" in current_file:
        speech: Timeline = current_file["speech"]

    # in "pipeline optimization" mode, pipeline hyper-parameters are different
    # every time a file is processed: speech regions must be recomputed
    else:
        speech = self.compute_speech(current_file)

    if self.only_sad:
        return speech.to_annotation(generator=iter(lambda: "SPEECH", None))

    # in "interactive annotation" mode, pipeline hyper-parameters are fixed.
    # therefore, there is no need to recompute embeddings every time a file
    # is processed: they can be passed with the file directly.
    if "embedding" in current_file:
        embedding: SlidingWindowFeature = current_file["embedding"]

    # in "pipeline optimization" mode, pipeline hyper-parameters are different
    # every time a file is processed: embeddings must be recomputed
    else:
        embedding = self.compute_embedding(current_file)

    window: SlidingWindow = embedding.sliding_window

    # segment_assignment[i] = s with s > 0 means that ith embedding is
    #     strictly contained in (1-based) sth segment.
    # segment_assignment[i] = s with s < 0 means that more than half of ith
    #     embedding is part of (1-based) sth segment.
    # segment_assignment[i] = 0 means that none of the above is true.
    segment_assignment: np.ndarray = self.get_segment_assignment(
        embedding, speech)

    # cluster_assignment[i] = k (k > 0) means that the ith embedding belongs
    #     to kth cluster
    # cluster_assignment[i] = 0 when segment_assignment[i] = 0
    cluster_assignment: np.ndarray = np.zeros((len(embedding),),
                                              dtype=np.int32)

    clean = segment_assignment > 0
    noisy = segment_assignment < 0
    clean_indices = np.where(clean)[0]
    if len(clean_indices) < 2:
        cluster_assignment[clean_indices] = 1

    else:
        # convert time-based constraints to index-based constraints
        cannot_link = index2index(time2index(cannot_link, window), clean)
        must_link = index2index(time2index(must_link, window), clean)

        dendrogram = pool(
            embedding[clean_indices],
            metric="cosine",
            cannot_link=cannot_link,
            must_link=must_link,
            must_link_method="propagate",
        )
        clusters = fcluster(dendrogram, self.emb_threshold,
                            criterion="distance")
        for i, k in zip(clean_indices, clusters):
            cluster_assignment[i] = k

    loose_indices = np.where(noisy)[0]
    if len(clean_indices) == 0:
        if len(loose_indices) < 2:
            clusters = [1] * len(loose_indices)
        else:
            dendrogram = pool(embedding[loose_indices], metric="cosine")
            clusters = fcluster(dendrogram, self.emb_threshold,
                                criterion="distance")
        for i, k in zip(loose_indices, clusters):
            cluster_assignment[i] = k

    else:
        # NEAREST NEIGHBOR
        distance = cdist(embedding[clean_indices],
                         embedding[loose_indices],
                         metric="cosine")
        nearest_neighbor = np.argmin(distance, axis=0)
        for loose_index, nn in zip(loose_indices, nearest_neighbor):
            strict_index = clean_indices[nn]
            cluster_assignment[loose_index] = cluster_assignment[strict_index]

        # # NEAREST CLUSTER
        # centroid = np.vstack(
        #     [
        #         np.mean(embedding[cluster_assignment == k], axis=0)
        #         for k in np.unique(clusters)
        #     ]
        # )
        # distance = cdist(centroid, embedding[loose_indices], metric="cosine")
        # cluster_assignment[loose_indices] = np.argmin(distance, axis=0) + 1

    # convert cluster assignment to pyannote.core.Annotation
    # (make sure to keep speech regions unchanged)
    hypothesis = Annotation(uri=current_file.get("uri", None))
    for s, segment in enumerate(speech):

        indices = np.where(segment_assignment == s + 1)[0]
        if len(indices) == 0:
            indices = np.where(segment_assignment == -(s + 1))[0]
            if len(indices) == 0:
                continue

        clusters = cluster_assignment[indices]

        start, k = segment.start, clusters[0]
        change_point = np.diff(clusters) != 0
        for i, new_k in zip(indices[1:][change_point],
                            clusters[1:][change_point]):
            end = window[i].middle + 0.5 * window.step
            hypothesis[Segment(start, end)] = k
            start = end
            k = new_k
        hypothesis[Segment(start, segment.end)] = k

    return hypothesis.support()
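
# Usage sketch for the diarization pipeline above. `InteractiveDiarization`
# is a hypothetical name for the enclosing pipeline class; the constraint
# pairs are illustrative timestamps, not from the source.
def _example_diarization(current_file):
    pipeline = InteractiveDiarization()  # hypothetical class name
    cannot_link = [(3.2, 17.8)]  # speakers at t=3.2s and t=17.8s differ
    must_link = [(5.0, 9.0)]     # same speaker at t=5.0s and t=9.0s
    return pipeline(current_file,
                    cannot_link=cannot_link,
                    must_link=must_link)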