import re
import warnings

from pyannote.core import Annotation, Segment, Timeline


def gecko_JSON_to_Annotation(gecko_JSON,
                             uri=None,
                             modality='speaker',
                             confidence_threshold=0.0,
                             collar=0.0,
                             expected_min_speech_time=0.0,
                             manual=False):
    """
    Parameters:
    -----------
    gecko_JSON : `dict`
        loaded from a Gecko-compliant JSON as defined in xml_to_GeckoJSON
    uri (uniform resource identifier) : `str`
        which identifies the annotation (e.g. episode number)
        Defaults to None.
    modality : `str`
        modality of the annotation as defined in https://github.com/pyannote/pyannote-core
    confidence_threshold : `float`, Optional.
        Segments whose confidence is at or below `confidence_threshold` are
        excluded from the annotated timeline.
        Defaults to 0.0 (keep every segment).
    collar : `float`, Optional.
        Merge tracks with the same label separated by less than `collar` seconds.
        Defaults to 0.0 (leave the tracks timeline untouched).
    expected_min_speech_time : `float`, Optional.
        Threshold (in seconds) under which the total speech duration is
        considered suspicious (the user is warned).
        Defaults to 0.0 (never warn).
    manual : `bool`
        Whether the JSON comes from a manual correction or straight from the
        forced-alignment output.
        In the former case, the region timings are used, so
        `confidence_threshold` and `collar` are irrelevant.
        In the latter case (default), the timing of each term is used.

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization as defined in https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the gecko_JSON file (depends on confidence_threshold)
    """
    annotation = Annotation(uri, modality)
    not_annotated = Timeline(uri=uri)
    for monologue in gecko_JSON["monologues"]:
        if not monologue:
            continue
        # '@' defined in https://github.com/hbredin/pyannote-db-plumcot/blob/develop/CONTRIBUTING.md#idepisodetxt
        # '+' defined in https://github.com/gong-io/gecko/blob/master/app/geckoModule/constants.js#L35
        speaker_ids = re.split(r"@|\+", monologue["speaker"]["id"])
        if manual:
            for speaker_id in speaker_ids:  # most of the time there's only one
                if speaker_id != '':  # happens with "all@"
                    annotation[Segment(monologue["start"], monologue["end"]),
                               speaker_id] = speaker_id
        else:
            for i, term in enumerate(monologue["terms"]):
                for speaker_id in speaker_ids:  # most of the time there's only one
                    if speaker_id != '':  # happens with "all@"
                        annotation[Segment(term["start"], term["end"]),
                                   speaker_id] = speaker_id
                if term["confidence"] <= confidence_threshold:
                    not_annotated.add(Segment(term["start"], term["end"]))

    if manual:
        annotated = Timeline([Segment(0.0, monologue["end"])], uri)
    else:
        annotation = annotation.support(collar)
        annotated = not_annotated.gaps(support=Segment(0.0, term["end"]))
    total_speech_time = annotation.crop(annotated).get_timeline().duration()
    if total_speech_time < expected_min_speech_time:
        warnings.warn(
            f"total speech time of {uri} is only {total_speech_time}")
    return annotation, annotated
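
# A minimal usage sketch (the JSON path and uri below are hypothetical):
# load a Gecko-compliant JSON and build the Annotation from a manual correction.
import json

with open("episode.json") as f:  # hypothetical path
    gecko_json = json.load(f)
annotation, annotated = gecko_JSON_to_Annotation(gecko_json,
                                                 uri="episode",
                                                 manual=True)
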
Example #2
    def regression(self, reference, before, after, uem=None, uemified=False):

        _, before, errors_before = self.difference(reference,
                                                   before,
                                                   uem=uem,
                                                   uemified=True)

        reference, after, errors_after = self.difference(reference,
                                                         after,
                                                         uem=uem,
                                                         uemified=True)

        behaviors = Annotation(uri=reference.uri, modality=reference.modality)

        # common (up-sampled) timeline
        common_timeline = errors_after.get_timeline().union(
            errors_before.get_timeline())
        common_timeline = common_timeline.segmentation()

        # align 'before' errors on common timeline
        B = self._tagger(errors_before, common_timeline)

        # align 'after' errors on common timeline
        A = self._tagger(errors_after, common_timeline)

        for segment in common_timeline:

            old_errors = B.get_labels(segment, unique=False)
            new_errors = A.get_labels(segment, unique=False)

            n1 = len(old_errors)
            n2 = len(new_errors)
            n = max(n1, n2)

            match = np.zeros((n, n), dtype=int)
            for i1, e1 in enumerate(old_errors):
                for i2, e2 in enumerate(new_errors):
                    match[i1, i2] = self._match_errors(e1, e2)

            mapping = self.munkres.compute(2 - match)

            for i1, i2 in mapping:

                if i1 >= n1:
                    track = behaviors.new_track(segment,
                                                candidate=REGRESSION,
                                                prefix=REGRESSION)
                    behaviors[segment,
                              track] = (REGRESSION, None, new_errors[i2])

                elif i2 >= n2:
                    track = behaviors.new_track(segment,
                                                candidate=IMPROVEMENT,
                                                prefix=IMPROVEMENT)
                    behaviors[segment,
                              track] = (IMPROVEMENT, old_errors[i1], None)

                elif old_errors[i1][0] == MATCH_CORRECT:

                    if new_errors[i2][0] == MATCH_CORRECT:
                        track = behaviors.new_track(segment,
                                                    candidate=BOTH_CORRECT,
                                                    prefix=BOTH_CORRECT)
                        behaviors[segment,
                                  track] = (BOTH_CORRECT, old_errors[i1],
                                            new_errors[i2])

                    else:
                        track = behaviors.new_track(segment,
                                                    candidate=REGRESSION,
                                                    prefix=REGRESSION)
                        behaviors[segment,
                                  track] = (REGRESSION, old_errors[i1],
                                            new_errors[i2])

                else:

                    if new_errors[i2][0] == MATCH_CORRECT:
                        track = behaviors.new_track(segment,
                                                    candidate=IMPROVEMENT,
                                                    prefix=IMPROVEMENT)
                        behaviors[segment,
                                  track] = (IMPROVEMENT, old_errors[i1],
                                            new_errors[i2])

                    else:
                        track = behaviors.new_track(segment,
                                                    candidate=BOTH_INCORRECT,
                                                    prefix=BOTH_INCORRECT)
                        behaviors[segment,
                                  track] = (BOTH_INCORRECT, old_errors[i1],
                                            new_errors[i2])

        behaviors = behaviors.support()

        if uemified:
            return reference, before, after, behaviors
        else:
            return behaviors
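
# The old/new error pairing above relies on the Hungarian algorithm
# (self.munkres.compute). A self-contained sketch of the same matching step,
# using scipy's linear_sum_assignment instead of the munkres package
# (illustration only):
import numpy as np
from scipy.optimize import linear_sum_assignment

match = np.array([[2, 0],
                  [0, 1]])  # match[i1, i2]: affinity between old and new errors
rows, cols = linear_sum_assignment(2 - match)  # affinity -> cost, as above
mapping = list(zip(rows, cols))  # optimal one-to-one mapping: (0, 0) and (1, 1)
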
Example #3
def gecko_JSON_to_UEM(gecko_JSON,
                      uri=None,
                      modality='speaker',
                      confidence_threshold=0.0,
                      collar=0.0,
                      expected_min_speech_time=0.0):
    """
    Parameters:
    -----------
    gecko_JSON : `dict`
        loaded from a Gecko-compliant JSON as defined in xml_to_GeckoJSON
    uri (uniform resource identifier) : `str`
        which identifies the annotation (e.g. episode number)
        Defaults to None.
    modality : `str`
        modality of the annotation as defined in https://github.com/pyannote/pyannote-core
    confidence_threshold : `float`, Optional.
        Segments whose confidence is at or below `confidence_threshold` are
        excluded from the UEM (annotated) timeline.
        Defaults to 0.0 (keep every segment).
    collar : `float`, Optional.
        Merge tracks with the same label separated by less than `collar` seconds.
        Defaults to 0.0 (leave the tracks timeline untouched).
    expected_min_speech_time : `float`, Optional.
        Threshold (in seconds) under which the total speech duration is
        considered suspicious (the user is warned).
        Defaults to 0.0 (never warn).

    Returns:
    --------
    annotation : pyannote `Annotation`
        for speaker identification/diarization as defined in https://github.com/pyannote/pyannote-core
    annotated : pyannote `Timeline`
        representing the annotated parts of the gecko_JSON file (depends on confidence_threshold)
    """
    annotation = Annotation(uri, modality)
    annotated = Timeline(uri=uri)
    last_confident = 0.0
    last_unconfident = 0.0
    for monologue in gecko_JSON["monologues"]:
        if not monologue:
            continue
        # '@' defined in https://github.com/hbredin/pyannote-db-plumcot/blob/develop/CONTRIBUTING.md#idepisodetxt
        # '+' defined in https://github.com/gong-io/gecko/blob/master/app/geckoModule/constants.js#L35
        speaker_ids = re.split(r"@|\+", monologue["speaker"]["id"])
        for i, term in enumerate(monologue["terms"]):
            term["confidence"], term["start"], term["end"] = map(
                float,
                (term.get("confidence", 0.), term["start"], term["end"]))
            unknown = False
            for speaker_id in speaker_ids:  # most of the time there's only one
                if '#unknown#' in speaker_id:
                    unknown = True
                if speaker_id != '':  # happens with "all@"
                    annotation[Segment(term["start"], term["end"]),
                               speaker_id] = speaker_id
            if term["confidence"] <= confidence_threshold:
                last_unconfident = term["end"]
            else:
                if last_unconfident < last_confident and not unknown:
                    annotated.add(Segment(last_confident, term["end"]))
                last_confident = term["start"]

    annotation = annotation.support(collar)
    total_speech_time = annotation.crop(annotated).get_timeline().duration()
    if total_speech_time < expected_min_speech_time:
        warnings.warn(
            f"total speech time of {uri} is only {total_speech_time}")
    return annotation, annotated.support()
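
# A minimal usage sketch (paths and uri are hypothetical): build the UEM
# timeline and dump it by hand, one "<uri> <channel> <start> <end>" line
# per annotated segment.
import json

with open("episode.json") as f:  # hypothetical path
    gecko_json = json.load(f)
annotation, annotated = gecko_JSON_to_UEM(gecko_json,
                                          uri="episode",
                                          confidence_threshold=0.5)
with open("episode.uem", "w") as f:
    for segment in annotated:
        f.write(f"episode 1 {segment.start:.3f} {segment.end:.3f}\n")
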
Example #4
# NOTE: the opening lines of this snippet were truncated in the source. The
# surviving keyword arguments below match spectralcluster's AutoTune options;
# `sample_ids`, `sequences`, `intervals` and `rttm_dir` are also defined
# outside this excerpt.
#                     p_percentile_max=0.95,
#                     init_search_step=0.01,
#                     search_level=3)
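
# `icassp2018_refinement_options` is referenced below but its definition was
# lost to the truncation. A hedged reconstruction, following the ICASSP 2018
# configuration documented in the spectralcluster README (top-level exports
# assumed from spectralcluster >= 3.0):
from pyannote.core import Annotation, Segment
from spectralcluster import (ICASSP2018_REFINEMENT_SEQUENCE, RefinementOptions,
                             SpectralClusterer)

icassp2018_refinement_options = RefinementOptions(
    gaussian_blur_sigma=1,
    p_percentile=0.95,
    thresholding_soft_multiplier=0.01,
    refinement_sequence=ICASSP2018_REFINEMENT_SEQUENCE)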

icassp2018_clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=18,
    autotune=None,
    laplacian_type=None,
    refinement_options=icassp2018_refinement_options,
    custom_dist="cosine")

for idx, sample_id in enumerate(sample_ids):
    labels = icassp2018_clusterer.predict(sequences[idx])
    print(f'Predicted labels for {sample_id} ({idx + 1}/{len(sample_ids)}): {labels}')

    annotation = Annotation()
    annotation.uri = sample_id
    for jdx, speaker_id in enumerate(labels):
        segment_interval = intervals[idx][jdx]
        annotation[Segment(segment_interval[0],
                           segment_interval[1])] = speaker_id

    rttm_file = '{}/{}.rttm'.format(rttm_dir, sample_id)
    with open(rttm_file, 'w') as file:
        annotation.support().write_rttm(file)

    # rttm_file_collar = '{}/rttm_colar/{}.rttm'.format(rttm_dir, sample_id)
    # with open(rttm_file_collar, 'w') as file:
    #     annotation.support(0.481).write_rttm(file)
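
# predict() expects one (num_embeddings, embedding_dim) float array per
# recording; a toy call on random data (illustration only):
import numpy as np

toy_sequence = np.random.randn(50, 256)
toy_labels = icassp2018_clusterer.predict(toy_sequence)  # one label per embedding
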
Example #5
    def generate_annotation(self, uri, labels):
        res = Annotation(uri)
        # one label per 1-second frame: frame i covers [i, i + 1)
        for start, label in enumerate(labels):
            res[Segment(start, start + 1)] = str(label)
        return res.support()
Example #6
    def __call__(self, reference, hypothesis):

        if isinstance(reference, Annotation):
            reference = reference.get_timeline()

        if isinstance(hypothesis, Annotation):
            hypothesis = hypothesis.get_timeline()

        # over-segmentation
        over = Timeline(uri=reference.uri)
        prev_r = reference[0]
        intersection = []
        for r, h in reference.co_iter(hypothesis):

            if r != prev_r:
                intersection = sorted(intersection)
                for _, segment in intersection[:-1]:
                    over.add(segment)
                intersection = []
                prev_r = r

            segment = r & h
            intersection.append((segment.duration, segment))

        intersection = sorted(intersection)
        for _, segment in intersection[:-1]:
            over.add(segment)

        # under-segmentation
        under = Timeline(uri=reference.uri)
        prev_h = hypothesis[0]
        intersection = []
        for h, r in hypothesis.co_iter(reference):

            if h != prev_h:
                intersection = sorted(intersection)
                for _, segment in intersection[:-1]:
                    under.add(segment)
                intersection = []
                prev_h = h

            segment = h & r
            intersection.append((segment.duration, segment))

        intersection = sorted(intersection)
        for _, segment in intersection[:-1]:
            under.add(segment)

        # extent
        extent = reference.extent()

        # correct (neither under- nor over-segmented)
        correct = under.union(over).gaps(support=extent)

        # frontier error (both under- and over-segmented)
        frontier = under.crop(over)

        # under-segmented
        not_over = over.gaps(support=extent)
        only_under = under.crop(not_over)

        # over-segmented
        not_under = under.gaps(support=extent)
        only_over = over.crop(not_under)

        status = Annotation(uri=reference.uri)
        # for segment in correct:
        #     status[segment, '_'] = 'correct'
        for segment in frontier:
            status[segment, '_'] = 'shift'
        for segment in only_over:
            status[segment, '_'] = 'over-segmentation'
        for segment in only_under:
            status[segment, '_'] = 'under-segmentation'

        return status.support()
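
# The over-/under-segmentation logic above is built on Timeline.co_iter, which
# pairs each segment with every intersecting segment of the other timeline.
# A standalone toy illustration:
from pyannote.core import Segment, Timeline

ref = Timeline([Segment(0, 10)])
hyp = Timeline([Segment(0, 4), Segment(4, 10)])
for r, h in ref.co_iter(hyp):
    print(r, h, r & h)  # two pairs: the reference segment is over-segmented
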
Example #7
    def __call__(
        self,
        current_file: ProtocolFile,
        cannot_link: List[Tuple[float, float]] = None,
        must_link: List[Tuple[float, float]] = None,
    ) -> Annotation:
        """Apply speaker diarization

        Parameters
        ----------
        current_file : ProtocolFile
            Protocol file.
        cannot_link :
            List of time-based "cannot link" constraints.
        must_link :
            List of time-based "must link" constraints.

        Returns
        -------
        diarization : Annotation
            Speaker diarization result.
        """

        if cannot_link is None:
            cannot_link = []
        if must_link is None:
            must_link = []

        if "duration" not in current_file:
            current_file["duration"] = get_audio_duration(current_file)

        # in "interactive annotation" mode, there is no need to recompute speech
        # regions every time a file is processed: they can be passed with the
        # file directly
        if "speech" in current_file:
            speech: Timeline = current_file["speech"]

        # in "pipeline optimization" mode, pipeline hyper-parameters are different
        # every time a file is processed: speech regions must be recomputed
        else:
            speech = self.compute_speech(current_file)

        if self.only_sad:
            return speech.to_annotation(generator=iter(lambda: "SPEECH", None))

        # in "interactive annotation" mode, pipeline hyper-parameters are fixed.
        # therefore, there is no need to recompute embeddings every time a file
        # is processed: they can be passed with the file directly.
        if "embedding" in current_file:
            embedding: SlidingWindowFeature = current_file["embedding"]

        # in "pipeline optimization" mode, pipeline hyper-parameters are different
        # every time a file is processed: embeddings must be recomputed
        else:
            embedding = self.compute_embedding(current_file)

        window: SlidingWindow = embedding.sliding_window

        # segment_assignment[i] = s with s > 0 means that ith embedding is
        #       strictly contained in (1-based) sth segment.
        # segment_assignment[i] = s with s < 0 means that more than half of ith
        #       embedding is part of (1-based) sth segment.
        # segment_assignment[i] = 0 means that none of the above is true.
        segment_assignment: np.ndarray = self.get_segment_assignment(
            embedding, speech)

        # cluster_assignment[i] = k (k > 0) means that the ith embedding belongs
        #                           to kth cluster
        # cluster_assignment[i] = 0 when segment_assignment[i] = 0
        cluster_assignment: np.ndarray = np.zeros((len(embedding), ),
                                                  dtype=np.int32)

        clean = segment_assignment > 0
        noisy = segment_assignment < 0
        clean_indices = np.where(clean)[0]
        if len(clean_indices) < 2:
            cluster_assignment[clean_indices] = 1

        else:

            # convert time-based constraints to index-based constraints
            cannot_link = index2index(time2index(cannot_link, window), clean)
            must_link = index2index(time2index(must_link, window), clean)

            dendrogram = pool(
                embedding[clean_indices],
                metric="cosine",
                cannot_link=cannot_link,
                must_link=must_link,
                must_link_method="propagate",
            )
            clusters = fcluster(dendrogram,
                                self.emb_threshold,
                                criterion="distance")
            for i, k in zip(clean_indices, clusters):
                cluster_assignment[i] = k

        loose_indices = np.where(noisy)[0]
        if len(clean_indices) == 0:
            if len(loose_indices) < 2:
                clusters = [1] * len(loose_indices)
            else:
                dendrogram = pool(embedding[loose_indices], metric="cosine")
                clusters = fcluster(dendrogram,
                                    self.emb_threshold,
                                    criterion="distance")
            for i, k in zip(loose_indices, clusters):
                cluster_assignment[i] = k

        else:
            # NEAREST NEIGHBOR
            distance = cdist(embedding[clean_indices],
                             embedding[loose_indices],
                             metric="cosine")
            nearest_neighbor = np.argmin(distance, axis=0)
            for loose_index, nn in zip(loose_indices, nearest_neighbor):
                strict_index = clean_indices[nn]
                cluster_assignment[loose_index] = cluster_assignment[
                    strict_index]

            # # NEAREST CLUSTER
            # centroid = np.vstack(
            #     [
            #         np.mean(embedding[cluster_assignment == k], axis=0)
            #         for k in np.unique(clusters)
            #     ]
            # )
            # distance = cdist(centroid, embedding[loose_indices], metric="cosine")
            # cluster_assignment[loose_indices] = np.argmin(distance, axis=0) + 1

        # convert cluster assignment to pyannote.core.Annotation
        # (make sure to keep speech regions unchanged)
        hypothesis = Annotation(uri=current_file.get("uri", None))
        for s, segment in enumerate(speech):

            indices = np.where(segment_assignment == s + 1)[0]
            if len(indices) == 0:
                indices = np.where(segment_assignment == -(s + 1))[0]
                if len(indices) == 0:
                    continue

            clusters = cluster_assignment[indices]

            start, k = segment.start, clusters[0]
            change_point = np.diff(clusters) != 0
            for i, new_k in zip(indices[1:][change_point],
                                clusters[1:][change_point]):
                end = window[i].middle + 0.5 * window.step
                hypothesis[Segment(start, end)] = k
                start = end
                k = new_k
            hypothesis[Segment(start, segment.end)] = k

        return hypothesis.support()
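
# A minimal usage sketch (the pipeline object and file dict are hypothetical):
# constraints are (t1, t2) pairs in seconds; the embeddings extracted at those
# times are then forced into different clusters (cannot_link) or the same
# cluster (must_link).
#
#     current_file = {"uri": "episode", "audio": "episode.wav"}
#     diarization = pipeline(
#         current_file,
#         cannot_link=[(3.2, 17.8)],  # different speakers at 3.2 s and 17.8 s
#         must_link=[(5.0, 5.5)],     # same speaker at 5.0 s and 5.5 s
#     )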