def _xxx_iter(self, subset):
    """Yield one protocol file per VoxCeleb1 `uri` of the requested subset(s).

    Parameters
    ----------
    subset : str or list of str
        Value(s) of the 'verification' column selecting the rows to use.

    Yields
    ------
    current_file : dict
        Dictionary with 'uri', 'database', 'annotation' and 'annotated' keys.
    """

    # Accept a single subset name as well as a list of them.
    subsets = subset if isinstance(subset, list) else [subset]

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('verification')

    for subset_name in subsets:

        subset_data = data.get_group(subset_name)

        # All segments sharing a uri are gathered in a single annotation.
        for uri, rows in subset_data.groupby('uri'):
            annotation = Annotation(uri=uri)
            for row in rows.itertuples():
                segment = Segment(row.start, row.end)
                annotation[segment] = row.speaker
            annotated = annotation.get_timeline()

            current_file = {
                'uri': uri,
                'database': 'VoxCeleb',
                'annotation': annotation,
                'annotated': annotated,
            }

            yield current_file
Exemple #2
    def _xxx_iter(self, subset):
        """Yield one protocol file per VoxCeleb1 `uri` of the requested subset(s).

        Parameters
        ----------
        subset : str or list of str
            Value(s) of the 'verification' column selecting the rows to use.

        Yields
        ------
        current_file : dict
            Dictionary with 'uri', 'database', 'annotation' and 'annotated'
            keys.
        """

        # Accept a single subset name as well as a list of them.
        if isinstance(subset, list):
            subsets = subset
        else:
            subsets = [subset]

        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
        data_csv = op.join(data_dir, 'voxceleb1.csv')
        data = pd.read_csv(data_csv, index_col=['segment'])
        data = data.groupby('verification')

        for subset_name in subsets:

            subset_data = data.get_group(subset_name)

            # All segments sharing a uri are gathered in a single annotation.
            for uri, rows in subset_data.groupby('uri'):
                annotation = Annotation(uri=uri)
                for row in rows.itertuples():
                    segment = Segment(row.start, row.end)
                    annotation[segment] = row.speaker
                annotated = annotation.get_timeline()

                current_file = {
                    'uri': uri,
                    'database': 'VoxCeleb',
                    'annotation': annotation,
                    'annotated': annotated,
                }

                yield current_file
Exemple #3
    def predict(self, features, min_duration=None, constraint=None):
        """Predict a label sequence and return it as an annotation.

        Parameters
        ----------
        features :
            Features to label. Must expose a `sliding_window` attribute.
        min_duration : float or dict, optional
            Minimum duration for each label, in seconds.
        constraint : optional
            Forwarded to `self._constraint` together with `features`.

        Returns
        -------
        annotation :
            One segment per run of identical consecutive predictions, with
            labels translated back through `self.label_converter_`.
        """

        constraint_ = self._constraint(constraint, features)
        consecutive = self._consecutive(min_duration, features)

        X = self.X(features, unknown='keep')
        sliding_window = features.sliding_window
        converted_y = self.classifier_.predict(X,
                                               consecutive=consecutive,
                                               constraint=constraint_)

        annotation = Annotation()

        # indices where the predicted label changes, padded on both sides so
        # that consecutive pairs delimit every run of identical labels
        diff = list(np.where(np.diff(converted_y))[0])
        diff = [-1] + diff + [len(converted_y)]

        for t, T in pairwise(diff):
            segment = sliding_window.rangeToSegment(t, T - t)
            annotation[segment] = converted_y[t + 1]

        translation = self.label_converter_.inverse_mapping()

        return annotation.translate(translation)
Exemple #4
    def _xxx_iter(self, subset):
        """Yield one protocol file per VoxCeleb1 segment of the requested subset(s).

        Parameters
        ----------
        subset : str or list of str
            Value(s) of the 'verification' column selecting the rows to use.

        Yields
        ------
        current_file : dict
            Dictionary with 'uri', 'database', 'annotation' and 'annotated'
            keys.
        """

        # Accept a single subset name as well as a list of them.
        subsets = subset if isinstance(subset, list) else [subset]

        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
        data_csv = op.join(data_dir, 'voxceleb1.csv')
        data = pd.read_csv(data_csv, index_col=['segment'])
        data = data.groupby('verification')

        # segment                          uri                      start end  speaker      verification identification
        # A.J._Buckley/1zcIwhmdeo4_0000001 A.J._Buckley/1zcIwhmdeo4 14.7  22.8 A.J._Buckley dev          trn

        for subset_name in subsets:

            subset_data = data.get_group(subset_name)

            # The frame is indexed by 'segment', so `uri` below is the segment
            # identifier and every row produces its own file.
            for uri, datum in subset_data.iterrows():

                annotation = Annotation(uri=uri)
                # The segment is re-based to start at 0 and keeps its duration.
                segment = Segment(0., datum.end - datum.start)
                annotation[segment] = datum.speaker

                annotated = annotation.get_timeline()

                current_file = {
                    'uri': uri,
                    'database': 'VoxCeleb',
                    'annotation': annotation,
                    'annotated': annotated,
                }

                yield current_file
Exemple #5
def vad_construct_pyannote_object_per_file(
    vad_table_filepath: str, groundtruth_RTTM_file: str
) -> Tuple[Annotation, Annotation]:
    """Construct a Pyannote object for evaluation.

    Args:
        vad_table_filepath(str) : path of vad rttm-like table.
        groundtruth_RTTM_file(str): path of groundtruth rttm file.

    Returns:
        reference(pyannote.Annotation): groundtruth
        hypothesis(pyannote.Annotation): prediction
    """

    pred = pd.read_csv(vad_table_filepath, sep=" ", header=None)
    label = pd.read_csv(groundtruth_RTTM_file, sep=" ", delimiter=None, header=None)
    label = label.rename(columns={3: "start", 4: "dur", 7: "speaker"})

    # construct reference
    reference = Annotation()
    for _, row in label.iterrows():
        reference[Segment(row['start'], row['start'] + row['dur'])] = row['speaker']

    # construct hypothesis: column 0 is the start time, column 1 the duration,
    # and every predicted segment gets the same 'Speech' label
    hypothesis = Annotation()
    for _, row in pred.iterrows():
        start = float(row[0])
        hypothesis[Segment(start, start + float(row[1]))] = 'Speech'
    return reference, hypothesis
    def _xxx_iter(self, subset):
        """Yield one protocol file per VoxCeleb1 segment of the requested subset(s).

        Parameters
        ----------
        subset : str or list of str
            Value(s) of the 'verification' column selecting the rows to use.

        Yields
        ------
        current_file : dict
            Dictionary with 'uri', 'database', 'annotation' and 'annotated'
            keys.
        """

        # Accept a single subset name as well as a list of them.
        if isinstance(subset, list):
            subsets = subset
        else:
            subsets = [subset]

        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
        data_csv = op.join(data_dir, 'voxceleb1.csv')
        data = pd.read_csv(data_csv, index_col=['segment'])
        data = data.groupby('verification')

        # segment                          uri                      start end  speaker      verification identification
        # A.J._Buckley/1zcIwhmdeo4_0000001 A.J._Buckley/1zcIwhmdeo4 14.7  22.8 A.J._Buckley dev          trn

        for subset_name in subsets:

            subset_data = data.get_group(subset_name)

            # The frame is indexed by 'segment', so `uri` below is the segment
            # identifier and every row produces its own file.
            for uri, datum in subset_data.iterrows():

                annotation = Annotation(uri=uri)
                # The segment is re-based to start at 0 and keeps its duration.
                segment = Segment(0., datum.end - datum.start)
                annotation[segment] = datum.speaker

                annotated = annotation.get_timeline()

                current_file = {
                    'uri': uri,
                    'database': 'VoxCeleb',
                    'annotation': annotation,
                    'annotated': annotated,
                }

                yield current_file
# Compute detection precision, error rate, recall and accuracy of predicted
# voice segments against reference segments, returned as a dict.
# NOTE(review): this snippet looks truncated by extraction -- the
# `frames_to_time(` call and the `metrics` dict are never closed, and
# `reference_segments` is not defined in the visible signature. Restore the
# missing lines from the original source before using this function.
def vad_metrics(predictions,
                # presumably 32 ms / 16 ms expressed in samples at 22050 Hz -- TODO confirm
                window_length=int(np.floor(0.032 * 22050)),
                hop_length=int(np.floor(0.016 * 22050))):
    # One timestamp per prediction frame (remaining arguments are missing).
    frame_times = librosa.frames_to_time(range(len(predictions)),
    predicted_segments = voice_segments(predictions, frame_times)

    # Every predicted segment gets the same label (1).
    hypothesis = Annotation()
    for seg in predicted_segments:
        hypothesis[Segment(seg[0], seg[1])] = 1

    # NOTE(review): `reference_segments` is not a parameter here; it was
    # presumably one in the original signature -- verify.
    reference = Annotation()
    for seg in reference_segments:
        reference[Segment(seg[0], seg[1])] = 1

    precision = DetectionPrecision()(reference, hypothesis)
    error = DetectionErrorRate()(reference, hypothesis)
    recall = DetectionRecall()(reference, hypothesis)
    accuracy = DetectionAccuracy()(reference, hypothesis)

    metrics = {
        "precision": precision,
        "error": error,
        "recall": recall,
        "accuracy": accuracy


    return metrics
    def predict(self, features, min_duration=None, constraint=None):
        """Predict a label sequence and return it as an annotation.

        Parameters
        ----------
        features :
            Features to label. Must expose a `sliding_window` attribute.
        min_duration : float or dict, optional
            Minimum duration for each label, in seconds.
        constraint : optional
            Forwarded to `self._constraint` together with `features`.

        Returns
        -------
        annotation :
            One segment per run of identical consecutive predictions, with
            labels translated back through `self.label_converter_`.
        """

        constraint_ = self._constraint(constraint, features)
        consecutive = self._consecutive(min_duration, features)

        X = self.X(features, unknown="keep")
        sliding_window = features.sliding_window
        converted_y = self.classifier_.predict(X, consecutive=consecutive, constraint=constraint_)

        annotation = Annotation()

        # indices where the predicted label changes, padded on both sides so
        # that consecutive pairs delimit every run of identical labels
        diff = list(np.where(np.diff(converted_y))[0])
        diff = [-1] + diff + [len(converted_y)]

        for t, T in pairwise(diff):
            segment = sliding_window.rangeToSegment(t, T - t)
            annotation[segment] = converted_y[t + 1]

        translation = self.label_converter_.inverse_mapping()

        return annotation.translate(translation)
Exemple #9
def DER(outfile, AudioDataSet, annotationlist, audioLength):
    """Print diarization error rate and cluster purity of a hypothesis file.

    Parameters
    ----------
    outfile : str
        Path to the hypothesis file. Each line is split on single spaces:
        field 3 is the start time, field 4 the duration and field 5 the label.
    AudioDataSet : str
        Data set name. 'DiaExample' selects a hard-coded reference; any other
        value builds the reference from `annotationlist`.
    annotationlist : sequence of str
        Paths to four XML annotation files, one per speaker (A, B, C, D).
        Unused when `AudioDataSet` is 'DiaExample'.
    audioLength : float
        Duration of the audio; the evaluation map is [0, audioLength].

    Returns
    -------
    metric, reference, hypothesis
        The `DiarizationErrorRate` metric object and both annotations.
    """
    reference = Annotation()

    if AudioDataSet != 'DiaExample':
        # One annotation file per speaker, always in A, B, C, D order.
        for index, speaker in enumerate('ABCD'):
            root = ET.parse(annotationlist[index]).getroot()
            for child in root.findall('segment'):
                # NOTE(review): the end attribute was lost in the truncated
                # original; 'transcriber_end' is assumed as the counterpart
                # of 'transcriber_start'.
                start = float(child.get('transcriber_start'))
                end = float(child.get('transcriber_end'))
                reference[Segment(start, end)] = speaker
    else:
        reference = Annotation()
        reference[Segment(0.15, 3.41)] = 'A'
        reference[Segment(3.83, 5.82)] = 'A'
        reference[Segment(6.75, 11.10)] = 'B'
        reference[Segment(11.32, 15.8)] = 'C'
        reference[Segment(15.9, 18.8)] = 'B'
        reference[Segment(18.8, 27.8)] = 'C'
        reference[Segment(27.8, 34.4)] = 'B'
        reference[Segment(34.4, 42)] = 'D'

    hypothesis = Annotation()
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(outfile, 'r') as f:
        for line in f:
            fields = line.split(' ')
            start = float(fields[3])
            end = start + float(fields[4])
            # drop the last character of the label field (the line ending)
            annotation = fields[5][0:-1]
            hypothesis[Segment(start, end)] = annotation
    metric = DiarizationErrorRate()
    metricPurity = DiarizationPurity()
    uem = Timeline([Segment(0, audioLength)])

    print('DER: %.2f %%' % (metric(reference, hypothesis, uem=uem) * 100))
    print('Cluster Purity: %.2f %%' %
          (metricPurity(reference, hypothesis, uem=uem) * 100))

    return metric, reference, hypothesis
Exemple #10
 def load_speaker(self, uri):
     """Read the speaker file of `uri` into a smoothed annotation.

     Each line must hold exactly five whitespace-separated fields: start,
     duration, speaker name and two ignored values.
     """
     annotation = Annotation(uri=uri)
     filename = self.get_audio_path(uri)
     with open(filename, 'r') as handle:
         for raw_line in handle:
             fields = raw_line.strip().split()
             onset, length, name, _, _ = fields
             onset = float(onset)
             offset = onset + float(length)
             annotation[Segment(onset, offset)] = name
     return annotation.smooth()
Exemple #11
    def _turn_level(self, current_file: dict,
                    speech_turns: Annotation) -> Annotation:
        """Apply clustering at speech turn level

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        speech_turns : `Annotation`
            Speech turns. Should only contain `str` labels.

        Returns
        -------
        hypothesis : `pyannote.core.Annotation`
            Clustering result.
        """

        assert_string_labels(speech_turns, "speech_turns")

        embedding = self._embedding(current_file)

        X, clustered_labels, skipped_labels = [], [], []
        for label in speech_turns.labels():

            timeline = speech_turns.label_timeline(label, copy=False)

            # be more and more permissive until we have
            # at least one embedding for current speech turn
            for mode in ["strict", "center", "loose"]:
                x = embedding.crop(timeline, mode=mode)
                if len(x) > 0:
                    break

            # skip labels so small we don't have any embedding for it
            if len(x) < 1:
                skipped_labels.append(label)
                continue

            clustered_labels.append(label)
            X.append(np.mean(x, axis=0))

        # apply clustering of label embeddings
        # (np.vstack fails on an empty list, so skip it when every label
        # was too short to get an embedding)
        clusters = self.clustering(np.vstack(X)) if X else []

        # map each clustered label to its cluster (between 1 and N_CLUSTERS)
        mapping = dict(zip(clustered_labels, clusters))

        # map each skipped label to its own cluster
        # (between -1 and -N_SKIPPED_LABELS)
        for l, label in enumerate(skipped_labels):
            mapping[label] = -(l + 1)

        # do the actual mapping
        return speech_turns.rename_labels(mapping=mapping)
Exemple #12
# Exercises CombinedCategoricalDissimilarity on a continuum holding two
# annotators ('liza' and 'pierrot').
# NOTE(review): this snippet looks truncated by extraction -- the
# `CombinedCategoricalDissimilarity(` call is never closed, nothing is ever
# appended to `list_dis`, `unit_align_b` is not closed and the symmetry
# assertion has lost its right-hand side. Restore the missing lines from the
# original test before running it.
def test_combi_categorical_dissimilarity():
    continuum = Continuum()
    # first annotator
    annotation = Annotation()
    annotation[Segment(1, 5)] = 'Carol'
    annotation[Segment(6, 8)] = 'Bob'
    annotation[Segment(12, 18)] = 'Carol'
    annotation[Segment(7, 20)] = 'Alice'
    continuum.add_annotation('liza', annotation)
    # second annotator
    annotation = Annotation()
    annotation[Segment(2, 6)] = 'Carol'
    annotation[Segment(7, 8)] = 'Bob'
    annotation[Segment(12, 18)] = 'Alice'
    annotation[Segment(8, 10)] = 'Alice'
    annotation[Segment(7, 19)] = 'Jeremy'
    continuum.add_annotation('pierrot', annotation)
    categories = ['Carol', 'Bob', 'Alice', 'Jeremy']

    # symmetric 4x4 matrix with a zero diagonal; presumably rows/columns
    # follow the order of `categories` -- TODO confirm
    cat = np.array([[0, 0.5, 0.3, 0.7], [0.5, 0., 0.6, 0.4],
                    [0.3, 0.6, 0., 0.7], [0.7, 0.4, 0.7, 0.]])
    combi_dis = CombinedCategoricalDissimilarity(categories=categories,
    list_dis = []
    # every (liza unit, pierrot unit) pair is turned into a unitary alignment
    for liza_unit in continuum['liza']:
        for pierrot_unit in continuum['pierrot']:
            unit_alignment = UnitaryAlignment(
                (("liza", liza_unit), ("pierrot", pierrot_unit)))
    assert list_dis == pytest.approx([
        0.09375, 5.11, 2.69375, 6.15, 8.790000000000001, 1.75,
        0.16666666666666666, 1.3020408163265305, 1.8, 6.3, 2.0237024221453286,
        1.4020408163265305, 0.3524, 0.8066666666666665, 0.20360110803324097,
        7.260000000000002, 7.137755102040815, 0.5166666666666666, 3.525, 0.15
    ], 0.001)

    # same pair of units listed in both annotator orders
    unit_align_a = UnitaryAlignment(
        (("liza", Unit(Segment(1, 5),
                       "Carol")), ("pierrot", Unit(Segment(7, 19), "Jeremy"))))
    unit_align_b = UnitaryAlignment((
        ("pierrot", Unit(Segment(7, 19), "Jeremy")),
        ("liza", Unit(Segment(1, 5), "Carol")),
    assert (unit_align_a.compute_disorder(combi_dis) ==

    # identical units are expected to have zero disorder
    same_align = UnitaryAlignment(
        (("liza", Unit(Segment(1, 5),
                       "Carol")), ("pierrot", Unit(Segment(1, 5), "Carol"))))

    assert same_align.compute_disorder(combi_dis) == np.float32(0.0)
def test_bug_16():
    """Check the 'total' component of DER against an empty hypothesis.

    The reference holds a single 10-unit segment; the expected total is 9
    with a collar of 1 and 10 with no collar.
    """
    reference = Annotation()
    reference[Segment(0, 10)] = 'A'
    hypothesis = Annotation()

    for collar, expected_total in ((1, 9), (0, 10)):
        metric = DiarizationErrorRate(collar=collar)
        components = metric(reference, hypothesis, detailed=True)
        npt.assert_almost_equal(components['total'], expected_total, decimal=3)
Exemple #14
 # Build two dicts (reference and hypothesis) holding one empty Annotation
 # per non-empty i-vector set in `self.ivecs`, keyed by a cleaned-up name.
 # NOTE(review): the right-hand side of `name =` was lost in extraction;
 # restore it from the original source (presumably the name of `ivecset`).
 def init_annotations(self):
     ref, hyp = {}, {}
     for ivecset in self.ivecs:
         if ivecset.size() > 0:
             name =
             # dirty trick, will be removed, watch out
             if 'beamformed' in name:
                 name = re.sub('beamformed/', '', name)
             # # # # # # # # # # # # # # # # # # # # #
             # keep only what precedes the first '/'
             name = re.sub('/.*', '', name)
             ref[name], hyp[name] = Annotation(), Annotation()
     return ref, hyp
def convert_labels(y_true, y_pred):
    """Turn two parallel label sequences into annotations.

    Item `i` of each sequence becomes the unit segment [i, i + 1]; items
    equal to `SILENCE` are left out.

    Returns
    -------
    hypothesis, reference
        Note the order: the hypothesis (from `y_pred`) comes first.
    """
    reference, hypothesis = Annotation(), Annotation()

    for position, (truth, guess) in enumerate(zip(y_true, y_pred)):
        frame = Segment(position, position + 1)

        if guess != SILENCE:
            hypothesis[frame] = guess
        if truth != SILENCE:
            reference[frame] = truth

    return hypothesis, reference
Exemple #16
def test_extrude():
    """Check `Annotation.extrude` in 'intersection', 'loose' and 'strict' modes."""
    annotation = Annotation()
    annotation[Segment(0, 10)] = "A"
    annotation[Segment(15, 20)] = "A"
    annotation[Segment(20, 35)] = "B"
    annotation[Segment(15, 25)] = "C"
    annotation[Segment(30, 35)] = "C"

    extrusion_tl = Timeline([Segment(5, 12),
                             Segment(14, 25)])

    # 'intersection': only the overlapping parts of segments are removed
    intersection_expected = Annotation()
    intersection_expected[Segment(0, 5)] = "A"
    intersection_expected[Segment(25, 35)] = "B"
    intersection_expected[Segment(30, 35)] = "C"

    assert (annotation.extrude(extrusion_tl, mode="intersection")
            == intersection_expected)

    # 'loose': any segment touching the extruded timeline is removed
    loose_expected = Annotation()
    loose_expected[Segment(30, 35)] = "C"

    assert (annotation.extrude(extrusion_tl, mode="loose")
            == loose_expected)

    # 'strict': only segments fully covered by the timeline are removed
    strict_expected = Annotation()
    strict_expected[Segment(0, 10)] = "A"
    strict_expected[Segment(20, 35)] = "B"
    strict_expected[Segment(30, 35)] = "C"

    assert (annotation.extrude(extrusion_tl, mode="strict")
            == strict_expected)
Exemple #17
def calculate_der(reference_filename, hypothesis_filename):
    """Return the diarization error rate between two Audacity label files.

    Hypothesis labels equal to 'non_speech' are ignored.
    """
    truth_entries = Util.read_audacity_labels(reference_filename)
    reference = Annotation()
    for entry in truth_entries:
        span = Segment(entry.start_seconds, entry.end_seconds)
        reference[span] = entry.label

    predicted_entries = Util.read_audacity_labels(hypothesis_filename)
    hypothesis = Annotation()
    for entry in predicted_entries:
        if entry.label == 'non_speech':
            continue
        span = Segment(entry.start_seconds, entry.end_seconds)
        hypothesis[span] = entry.label

    return DiarizationErrorRate()(reference, hypothesis)
Exemple #18
    def _partition(self, timeline, coverage):
        """Split `timeline` at every segment boundary and crop to `coverage`.

        Parameters
        ----------
        timeline :
            Iterable of segments exposing `start` and `end`.
        coverage :
            Support passed to `crop` (mode 'intersection').

        Returns
        -------
        partition :
            Annotation whose segments are delimited by consecutive boundaries
            of `timeline`, cropped to `coverage`, with anonymized tracks.
        """

        # boundaries (as set of timestamps)
        boundaries = set()
        for segment in timeline:
            boundaries.add(segment.start)
            boundaries.add(segment.end)

        # partition (as timeline)
        partition = Annotation()
        for start, end in pairwise(sorted(boundaries)):
            segment = Segment(start, end)
            partition[segment] = '_'

        return partition.crop(coverage, mode='intersection').anonymize_tracks()
Exemple #19
def test_crop_strict(annotation):
    """Check that a 'strict' crop keeps only segments fully inside the focus."""
    # NOTE(review): the constructor arguments were lost in the truncated
    # original; they are taken from the input annotation so that the expected
    # result describes the same resource -- confirm against upstream.
    expected = Annotation(
        uri=annotation.uri,
        modality=annotation.modality)
    expected[Segment(5.5, 7), '_'] = 'Leonard'
    actual = annotation.crop(Segment(5, 9), mode='strict')
    assert actual == expected, str(actual)
Exemple #20
def test_from_json(annotation):
    """Round-trip an annotation through `for_json` / `from_json`."""
    # The dict produced by for_json must be enough to rebuild the annotation.
    serialized = annotation.for_json()
    rebuilt = Annotation.from_json(serialized)
    assert rebuilt == annotation
Exemple #21
def test_from_records(annotation):
    """Round-trip an annotation through `itertracks` / `from_records`."""
    # The (segment, track, label) records must be enough to rebuild it.
    rebuilt = Annotation.from_records(annotation.itertracks(yield_label=True))
    assert rebuilt == annotation
def reference():
    """Return a small reference annotation with speakers A, B and C."""
    turns = (
        (0, 5, 'A'),
        (6, 10, 'B'),
        (12, 14, 'A'),
        (15, 20, 'C'),
    )
    annotation = Annotation()
    for start, end, speaker in turns:
        annotation[Segment(start, end)] = speaker
    return annotation
def clip_to_annotations(clip_number, lena_mappings, human_mappings):
    """Returns (human_annotation, lena_annotation)

    Parameters
    ----------
    clip_number :
        Value of the 'ClipNumber' index in the metadata CSV.
    lena_mappings : dict
        Label remapping for the LENA (CHAT) annotation; its 'SIL' entry is
        the default (silence) class.
    human_mappings : dict
        Label remapping for the human (TextGrid) annotation; its 'Silence'
        entry is the default (silence) class.
    """
    df = pd.read_csv(METADATA_PATH, index_col='ClipNumber')
    clip_row = df.loc[clip_number]

    its_filename = clip_row.ProcessingFile
    chat_filename = 'e{}.cha'.format(its_filename.split('.')[0])
    textgrid_filename = 'Clip{}.TextGrid'.format(clip_number)

    lena_dict = lena_chat_to_dict(os.path.join(CHAT_PATH, chat_filename))
    textgrid_dict = textgrid_to_dict(os.path.join(TEXTGRID_PATH, textgrid_filename))
    # remap
    lena_dict = remap(lena_dict, lena_mappings)
    textgrid_dict = remap(textgrid_dict, human_mappings)

    # set default (silence) class
    lena_annotation = dict_to_annotation(lena_dict, lena_mappings['SIL'])
    human_annotation = dict_to_annotation(textgrid_dict, human_mappings['Silence'])

    start_time = clip_row.StartTimeS
    end_time = start_time + 300  # 5 minutes
    # The crop doesn't begin at 0, but at start_time, so we need to shift it left.
    lena_cropped = lena_annotation.crop(Segment(start_time, end_time))
    lena_annotation_shifted = Annotation()
    for segment, track, label in lena_cropped.itertracks(yield_label=True):
        shifted_segment = Segment(segment.start - start_time, segment.end - start_time)
        lena_annotation_shifted[shifted_segment, track] = label

    return human_annotation, lena_annotation_shifted
    def run(self):
        """Convert subtitle edges of a transcription into an annotation.

        Loads the transcription from `self.in_subtitles()`, creates one
        uniquely-labelled segment per edge carrying a 'subtitle' entry, and
        dumps the resulting annotation to `self.out_put()`.
        """
        with self.in_subtitles().open('r') as fp:
            transcription = pyannote.core.json.load(fp)
        annotation = Annotation()
        label = 0
        for start, end, edge in transcription.ordered_edges_iter(data=True):
            # only edges that carry a subtitle become segments
            if 'subtitle' not in edge:
                continue
            segment = Segment(start, end)
            annotation[segment] = label
            label += 1

        annotation = annotation.anonymize_labels(generator='string')

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)
    def _xxx_iter(self, subset):
        """Yield one protocol file per `uri` found in the 'annotated' table.

        Parameters
        ----------
        subset :
            Forwarded to `self._load_data`.

        Yields
        ------
        current_file : dict
            Dictionary with 'database', 'uri', 'annotated' and 'annotation'
            keys.
        """

        data = self._load_data(subset)

        AnnotatedGroups = data['annotated'].groupby(by='uri')
        AnnotationGroups = data['annotation'].groupby(by='uri')

        for raw_uri, annotated in AnnotatedGroups:

            uri = f'{raw_uri}.Mix-Headset'

            segments = [Segment(start=segment.start, end=segment.end)
                        for segment in annotated.itertuples()]

            annotation = Annotation(uri=uri)
            # NOTE(review): the iterable was lost in the truncated original;
            # the speaker turns of the same uri are the only consistent source
            # of `start`, `duration` and `speaker` -- confirm against upstream.
            turns = AnnotationGroups.get_group(raw_uri)
            for t, turn in enumerate(turns.itertuples()):
                segment = Segment(start=turn.start,
                                  end=turn.start + turn.duration)
                annotation[segment, t] = turn.speaker

            current_file = {
                'database': 'Test',
                'uri': uri,
                'annotated': Timeline(uri=uri, segments=segments),
                'annotation': annotation,
            }

            yield current_file
Exemple #26
    def _as_scores(self, raw, features, segmentation):
        """Aggregate frame-level raw scores over each segment.

        Parameters
        ----------
        raw :
            2-D array indexed by (frame, label); rows are averaged per segment.
        features :
            Features exposing the `sliding_window` used to map segments to
            frame ranges.
        segmentation : Timeline or Annotation
            Segments to aggregate over. A Timeline is first converted to an
            Annotation with '?' labels.

        Returns
        -------
        scores : Scores
            One value per (segment, track, label).
        """

        if isinstance(segmentation, Timeline):
            annotation = Annotation(uri=segmentation.uri)
            for segment in segmentation:
                annotation[segment] = '?'
            segmentation = annotation

        # convert to pyannote-style & aggregate over each segment
        # NOTE(review): the second argument was lost in the truncated
        # original; `modality` is assumed -- confirm against upstream.
        scores = Scores(uri=segmentation.uri,
                        modality=segmentation.modality)

        sliding_window = features.sliding_window

        for segment, track in segmentation.itertracks():

            # extract raw for all features in segment and aggregate
            i_start, i_duration = sliding_window.segmentToRange(segment)
            p = np.mean(raw[i_start:i_start + i_duration, :], axis=0)

            for i, label in enumerate(self.label_converter_):
                scores[segment, track, label] = p[i]

        return scores
def reference():
    """Return a reference annotation with four non-overlapping turns."""
    turns = (
        (0, 10, 'A'),
        (12, 20, 'B'),
        (24, 27, 'A'),
        (30, 40, 'C'),
    )
    annotation = Annotation()
    for start, end, speaker in turns:
        annotation[Segment(start, end)] = speaker
    return annotation
def reference_with_overlap():
    """Return a reference annotation whose first two turns overlap on [12, 13]."""
    turns = (
        (0, 13, 'A'),
        (12, 20, 'B'),
        (24, 27, 'A'),
        (30, 40, 'C'),
    )
    annotation = Annotation()
    for start, end, speaker in turns:
        annotation[Segment(start, end)] = speaker
    return annotation
Exemple #29
    def __call__(self):
        """Label every shot with the thread it belongs to.

        Threads are the connected components of the threads graph. Labels are
        handed out in the order of each thread's first shot, and the result
        is smoothed before being returned.
        """

        # each connected component becomes a sorted list of shots
        components = nx.connected_components(self._threads_graph())
        threads = [sorted(component) for component in components]

        annotation = Annotation()
        labelGenerator = getLabelGenerator()

        # order threads by their first shot
        threads.sort(key=lambda shots: shots[0])
        for shots in threads:
            thread_label = next(labelGenerator)
            for shot in shots:
                annotation[shot] = thread_label

        return annotation.smooth()
    # Rebuild a hypothesis annotation from per-frame speaker scores.
    # NOTE(review): this snippet looks truncated by extraction -- `self` is
    # missing from the signature although the body reads `self.allow_overlap`
    # and `self.lock_speech`, several expressions have lost their operands
    # (`N, K =`, `= > 0.5`, `np.argmax(, axis=1)`), an `else:` branch appears
    # to be missing and the `one_hot_decoding(` call is never closed. Restore
    # the missing parts from the original source before using this method.
    def _decode(
        current_file: ProtocolFile,
        hypothesis: Annotation,
        scores: SlidingWindowFeature,
        labels: Iterable,
    ) -> Annotation:

        N, K =

        if self.allow_overlap:
            # presumably thresholds the scores at 0.5 -- operand is missing
            active_speakers = > 0.5

            if self.lock_speech:
                active_speakers = np.argmax(, axis=1) + 1

                active_speakers = np.argmax(, axis=1)

        # reconstruct annotation
        new_hypothesis = one_hot_decoding(active_speakers,

        new_hypothesis.uri = hypothesis.uri

        # when speech is locked, keep only what falls inside the speech
        # regions of the input hypothesis
        if self.lock_speech:
            speech = hypothesis.get_timeline().support()
            new_hypothesis = new_hypothesis.crop(speech)

        return new_hypothesis
Exemple #31
def load_mdtm(file_mdtm):
    """Load MDTM file

    Parameters
    ----------
    file_mdtm : `str`
        Path to MDTM file.

    Returns
    -------
    annotations : `dict`
        Speaker diarization as a {uri: pyannote.core.Annotation} dictionary.
    """

    names = ['uri', 'NA1', 'start', 'duration', 'NA2', 'NA3', 'NA4', 'speaker']
    dtype = {'uri': str, 'start': float, 'duration': float, 'speaker': str}
    # NOTE(review): the remaining arguments were lost in the truncated
    # original; whitespace-separated fields are assumed.
    data = pd.read_csv(file_mdtm, names=names, dtype=dtype,
                       sep=r'\s+')

    annotations = dict()
    for uri, turns in data.groupby('uri'):
        annotation = Annotation(uri=uri)
        # the row index is used as track name so that simultaneous turns
        # on the same segment do not overwrite each other
        for i, turn in turns.iterrows():
            segment = Segment(turn.start, turn.start + turn.duration)
            annotation[segment, i] = turn.speaker
        annotations[uri] = annotation

    return annotations
def rttm_to_annotation(input_rttm, class_to_keep=None):
    """
        Given a path to a rttm file, create the corresponding Annotation objects
        containing the triplets (t_beg, t_end, activity)

    Parameters
    ----------
    input_rttm : str
        A path to a rttm file that must exist.
    class_to_keep : str, optional
        When given, only segments whose speaker field equals this value are
        kept. Defaults to None (keep all classes).

    Returns
    -------
    anno : Annotation
        An Annotation object. It is empty when `input_rttm` is not a file.

    Raises
    ------
    ValueError
        If a line has an empty speaker field.
    """
    anno = Annotation(uri=input_rttm)
    if os.path.isfile(input_rttm):
        with open(input_rttm) as fn:
            for line in fn:
                # fields are tab-separated: 3 = onset, 4 = duration, 7 = speaker
                row = line.split('\t')
                t_beg, t_dur, spkr = float(row[3]), float(row[4]), row[7]
                if spkr == "":
                    raise ValueError("Speaker role is empty in %s" %
                                     input_rttm)
                # keep all classes, or only the class of interest
                if class_to_keep is None or spkr == class_to_keep:
                    anno[Segment(t_beg, t_beg + t_dur)] = spkr
    return anno
    # Load Openface features and build a face-track annotation from them.
    # Returns a (starting_point, features) pair: the annotation and the
    # features grouped by 'track'.
    # NOTE(review): this snippet looks truncated by extraction -- the two
    # lines after the signature are docstring text that lost its quotes, the
    # `read_table(` call is never closed and the `if not segment:` test has no
    # body (presumably `continue`). Restore the missing parts from the
    # original source before using this method.
    def preprocess(self, openface):
        openface : str
            Path to Openface features

        # TODO : option to only keep 'detections'
        # (make sure it does not alter 'starting_point' segments)

        # column names: 'time', 'track', then 'd0' ... 'd127'
        names = ['time', 'track']
        for i in range(128):
            names += ['d{0}'.format(i)]
        data = read_table(openface,
        features = data.groupby('track')
        starting_point = Annotation(modality='face')
        # one segment per track, the track identifier doubling as its label
        for track, segment in features.apply(self._to_segment).iteritems():
            if not segment:
            starting_point[segment, track] = track

        return starting_point, features
Exemple #34
def load_mdtm(file_mdtm):
    """Load MDTM file

    Parameters
    ----------
    file_mdtm : `str`
        Path to MDTM file.

    Returns
    -------
    annotations : `dict`
        Speaker diarization as a {uri: pyannote.core.Annotation} dictionary.
    """

    names = ["uri", "NA1", "start", "duration", "NA2", "NA3", "NA4", "speaker"]
    dtype = {"uri": str, "start": float, "duration": float, "speaker": str}
    # NOTE(review): the arguments of this call were truncated in this file.
    # `file_mdtm`, `names` and `dtype` are the only candidates in scope; the
    # whitespace separator is assumed -- confirm.
    data = pd.read_csv(file_mdtm, names=names, dtype=dtype, sep=r"\s+")

    annotations = dict()
    for uri, turns in data.groupby("uri"):
        annotation = Annotation(uri=uri)
        # the row index is used as track name so that turns sharing the same
        # segment do not overwrite each other
        for turn in turns.itertuples():
            segment = Segment(turn.start, turn.start + turn.duration)
            annotation[segment, turn.Index] = turn.speaker
        annotations[uri] = annotation

    return annotations
Exemple #35
    def __call__(self):
        """Label every shot with the thread it belongs to.

        Threads are the connected components of `self._threads_graph()`.
        Shots are sorted inside each thread, threads are sorted by their
        first shot, and each thread receives the next label drawn from
        `getLabelGenerator()`.

        Returns
        -------
        The smoothed Annotation mapping each shot to its thread label.
        """
        components = nx.connected_components(self._threads_graph())
        sorted_threads = [sorted(component) for component in components]

        result = Annotation()
        labels = getLabelGenerator()

        # threads in chronological order of their first shot
        sorted_threads.sort(key=lambda shots: shots[0])
        for shots in sorted_threads:
            thread_label = next(labels)
            for shot in shots:
                result[shot] = thread_label

        return result.smooth()
    def _partition(self, timeline, coverage):
        """Cut `timeline` at every segment boundary and crop to `coverage`.

        Parameters
        ----------
        timeline
            Iterable of segments whose boundaries define the cuts.
        coverage
            Support passed to `Annotation.crop` (mode='intersection').

        Returns
        -------
        Annotation with one '_'-labelled segment per pair of consecutive
        boundaries, cropped to `coverage`, with anonymized tracks.
        """

        # boundaries (as set of timestamps)
        # NOTE(review): the body of this loop was missing in this file; it
        # is restored from the comment above and from how `boundaries` is
        # consumed below (assumes segments expose .start/.end) -- confirm.
        boundaries = set()
        for segment in timeline:
            boundaries.add(segment.start)
            boundaries.add(segment.end)

        # partition (as annotation with a single dummy label)
        partition = Annotation()
        for start, end in pairwise(sorted(boundaries)):
            partition[Segment(start, end)] = '_'

        # crop once: the same crop used to be computed twice, the first
        # result being discarded
        return partition.crop(coverage, mode='intersection').anonymize_tracks()
    def run(self):
        """Dump the speaker turns of a file as a gap-filled timeline.

        Reads the duration of the input wav file and the speaker tracks,
        merges the speaker segments into a timeline, fills the gaps shorter
        than `self.fill_gaps`, and writes the result as JSON: an Annotation
        with one string label per segment when `self.to_annotation` is set,
        the Timeline itself otherwise.
        """
        # local import: the reader call was truncated in this file and the
        # file-level imports are not visible from here
        import wave

        # wav file duration
        # NOTE(review): `wave.open(wav, ...)` is restored from the
        # getnframes()/getframerate() calls on the handle -- confirm.
        wav = self.in_wav().path
        with contextlib.closing(wave.open(wav, 'r')) as f:
            frames = f.getnframes()
            rate = f.getframerate()
        # float() keeps the fractional part under Python 2 as well
        duration = frames / float(rate)
        extent = Segment(0., duration)

        with self.in_speaker().open('r') as fp:
            speaker = pyannote.core.json.load(fp)

        # NOTE(review): the two `timeline.add(...)` calls below were missing
        # in this file and are reconstructed from context -- confirm.
        timeline = Timeline()
        for segment, _ in speaker.itertracks():
            timeline.add(segment)

        # fill gaps (collected first so the timeline is not modified while
        # its gaps are being enumerated)
        short_gaps = [gap for gap in timeline.gaps(extent)
                      if gap.duration < self.fill_gaps]
        for gap in short_gaps:
            timeline.add(gap)

        timeline = timeline.coverage()

        if self.to_annotation:
            # dump as annotation...
            output = Annotation()
            for s, segment in enumerate(timeline):
                output[segment] = s
            output = output.anonymize_labels(generator='string')
        else:
            # ... or as timeline
            output = timeline

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(output, fp)
    def trn_iter(self):
        """Yield one file description per `uri` of the 'trn' subset.

        Rows are read from `data/voxceleb1.csv` (next to this module) and
        restricted to those whose 'identification' column equals 'trn'.

        Yields
        ------
        current_file : dict
            'uri', 'database' ('VoxCeleb'), 'annotation' (one segment per
            row, labelled with the row speaker) and 'annotated' (timeline
            of that annotation).
        """

        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
        data_csv = op.join(data_dir, 'voxceleb1.csv')
        data = pd.read_csv(data_csv, index_col=['segment'])
        data = data.groupby('identification').get_group('trn')

        for uri, rows in data.groupby('uri'):
            annotation = Annotation(uri=uri)
            for row in rows.itertuples():
                annotation[Segment(row.start, row.end)] = row.speaker

            # the closing brace of this literal was missing in this file
            yield {
                'uri': uri,
                'database': 'VoxCeleb',
                'annotation': annotation,
                'annotated': annotation.get_timeline(),
            }
    def run(self):
        # Labels every speaker segment of the input as 'speech' and the gaps
        # of the resulting timeline within the file extent as 'non_speech',
        # then dumps the smoothed annotation as JSON.

        # wav file duration
        wav = self.in_wav().path
        # NOTE(review): the first argument of closing() is missing in this
        # copy; given getnframes()/getframerate() below, a wave-style reader
        # opened on `wav` is expected -- restore before use.
        with contextlib.closing(, 'r')) as f:
            frames = f.getnframes()
            rate = f.getframerate()
        # NOTE(review): integer division under Python 2 when both operands
        # are ints -- confirm the targeted Python version.
        duration = frames / rate
        extent = Segment(0., duration)

        with self.in_speaker().open('r') as fp:
            speaker = pyannote.core.json.load(fp)

        # speaker identity is dropped: every track becomes 'speech'
        segmentation = Annotation()
        for segment, _ in speaker.itertracks():
            segmentation[segment] = 'speech'
        segmentation = segmentation.smooth()

        # NOTE(review): the assignment below is indented one level deeper
        # than its `for`; a guarding line (e.g. an `if`) may be missing from
        # this copy -- confirm.
        for gap in segmentation.get_timeline().gaps(extent):
                segmentation[gap] = 'non_speech'
        segmentation = segmentation.smooth()

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(segmentation, fp)
    def iter_triplets(self, from_annotation):
        """Yield (anchor, positive, negative) segment triplets

        Parameters
        ----------
        from_annotation : Annotation
            Annotation from which triplets are obtained.

        Yields
        ------
        (a, p, n) segments, or ((a, a_), (p, p_), (n, n_)) pairs of segment
        and label when `self.yield_label` is set. When `self.duration` is
        set, each segment is first passed through `self.pick`.

        NOTE(review): several lines of this method are missing from this
        copy (truncated calls, guard bodies, an `else:`); each spot is
        flagged below.
        """

        # NOTE(review): truncated call -- remaining arguments not visible.
        t = RandomTrackTriplets(per_label=self.per_label,

        # NOTE(review): truncated call -- remaining arguments not visible.
        annotation = Annotation(uri=from_annotation.uri,
        for segment, track, label in from_annotation.itertracks(label=True):
            # NOTE(review): missing body; presumably `continue`, so that
            # tracks shorter than self.duration are left out -- confirm.
            if segment.duration < self.duration:
            annotation[segment, track] = label

        # NOTE(review): missing body; presumably an early return when fewer
        # than two labels remain -- confirm.
        if len(annotation.labels()) < 2:

        triplets = t.iter_triplets(annotation)

        for triplet in triplets:

            # first item of each triplet member is used as its segment
            a, p, n = [item[0] for item in triplet]

            if self.duration:
                a, p, n = [self.pick(s) for s in (a, p, n)]

            if self.yield_label:
                # third item of each triplet member is used as its label
                a_, p_, n_ = [item[2] for item in triplet]
                yield (a, a_), (p, p_), (n, n_)
                # NOTE(review): an `else:` appears to be missing before the
                # next line -- confirm.
                yield a, p, n
    def run(self):
        """Turn subtitle timings into a speech / non_speech annotation.

        Every transcription edge carrying a 'subtitle' entry becomes a
        'speech' segment; the gaps of the resulting timeline within the wav
        file extent are labelled 'non_speech'. The annotation is written as
        JSON to the task output.
        """
        # local import: the reader call was truncated in this file and the
        # file-level imports are not visible from here
        import wave

        # wav file duration
        # NOTE(review): `wave.open(wav, ...)` is restored from the
        # getnframes()/getframerate() calls on the handle -- confirm.
        wav = self.in_wav().path
        with contextlib.closing(wave.open(wav, 'r')) as f:
            frames = f.getnframes()
            rate = f.getframerate()
        # float() keeps the fractional part under Python 2 as well
        duration = frames / float(rate)
        extent = Segment(0., duration)

        with self.in_subtitles().open('r') as fp:
            transcription = pyannote.core.json.load(fp)

        annotation = Annotation()
        for start, end, edge in transcription.ordered_edges_iter(data=True):
            # edges without subtitle do not count as speech
            # (the `continue` was missing in this file)
            if 'subtitle' not in edge:
                continue
            annotation[Segment(start, end)] = 'speech'

        for gap in annotation.get_timeline().gaps(extent):
            annotation[gap] = 'non_speech'

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)
    def regression(self, reference, before, after, uem=None, uemified=False):
        # Compares the error analyses of two hypotheses (`before`, `after`)
        # against the same `reference` and labels every segment of their
        # common timeline with a (behavior, old_error, new_error) tuple,
        # behavior being REGRESSION, IMPROVEMENT, BOTH_CORRECT or
        # BOTH_INCORRECT. Returns (reference, before, after, behaviors) when
        # `uemified` is true, the smoothed `behaviors` annotation otherwise.
        #
        # NOTE(review): several lines of this method are missing from this
        # copy (truncated call arguments and `else:` lines); each spot is
        # flagged below. The code as shown is not valid Python.

        _, before, errors_before = self.difference(
            reference, before, uem=uem, uemified=True)

        reference, after, errors_after = self.difference(
            reference, after, uem=uem, uemified=True)

        behaviors = Annotation(uri=reference.uri, modality=reference.modality)

        # common (up-sampled) timeline
        # NOTE(review): truncated call; presumably the union with the
        # timeline of `errors_before` -- confirm.
        common_timeline = errors_after.get_timeline().union(
        common_timeline = common_timeline.segmentation()

        # align 'before' errors on common timeline
        B = self._tagger(errors_before, common_timeline)

        # align 'after' errors on common timeline
        A = self._tagger(errors_after, common_timeline)

        for segment in common_timeline:

            old_errors = B.get_labels(segment, unique=False)
            new_errors = A.get_labels(segment, unique=False)

            n1 = len(old_errors)
            n2 = len(new_errors)
            n = max(n1, n2)

            # square matrix: rows/columns beyond n1/n2 stay at zero and
            # stand for "no error on that side"
            match = np.zeros((n, n), dtype=int)
            for i1, e1 in enumerate(old_errors):
                for i2, e2 in enumerate(new_errors):
                    match[i1, i2] = self._match_errors(e1, e2)

            # `2 - match` is handed to the assignment solver as a cost
            # NOTE(review): assumes _match_errors returns at most 2 -- confirm
            mapping = self.munkres.compute(2 - match)

            # NOTE(review): every `new_track(segment,` call below is
            # truncated; its remaining arguments are not visible here.
            for i1, i2 in mapping:

                # new error paired with a padding row: nothing before
                if i1 >= n1:
                    track = behaviors.new_track(segment,
                    behaviors[segment, track] = (
                        REGRESSION, None, new_errors[i2])

                # old error paired with a padding column: nothing after
                elif i2 >= n2:
                    track = behaviors.new_track(segment,
                    behaviors[segment, track] = (
                        IMPROVEMENT, old_errors[i1], None)

                elif old_errors[i1][0] == MATCH_CORRECT:

                    if new_errors[i2][0] == MATCH_CORRECT:
                        track = behaviors.new_track(segment,
                        behaviors[segment, track] = (
                            BOTH_CORRECT, old_errors[i1], new_errors[i2])

                        # NOTE(review): an `else:` appears to be missing
                        # before the next statement -- confirm.
                        track = behaviors.new_track(segment,
                        behaviors[segment, track] = (
                            REGRESSION, old_errors[i1], new_errors[i2])

                # NOTE(review): an `else:` (old error was not a correct
                # match) appears to be missing here -- confirm.

                    if new_errors[i2][0] == MATCH_CORRECT:
                        track = behaviors.new_track(segment,
                        behaviors[segment, track] = (
                            IMPROVEMENT, old_errors[i1], new_errors[i2])

                        # NOTE(review): an `else:` appears to be missing
                        # before the next statement -- confirm.
                        track = behaviors.new_track(segment,
                        behaviors[segment, track] = (
                            BOTH_INCORRECT, old_errors[i1], new_errors[i2])

        behaviors = behaviors.smooth()

        if uemified:
            return reference, before, after, behaviors
            # NOTE(review): an `else:` appears to be missing before the next
            # line -- confirm.
            return behaviors
        # NOTE(review): fragment -- the enclosing definition is not visible,
        # and `path`, `uri`, `mapping`, `video_id` come from lines outside
        # this copy.

        # load face tracks as pyannote.Annotation: one segment per track,
        # spanning its first to last timestamp, labelled via `mapping`
        names = ['time', 'track_id', 'left', 'top', 'right', 'bottom']
        face_tracking = pd.read_table(path, delim_whitespace=True, header=None, names=names)
        pyannote_face = Annotation(uri=uri)
        for track_id, track in face_tracking.groupby('track_id'):
            start = track['time'].min()
            end = track['time'].max()
            label = mapping.get(track_id, None)
            if label is None:
                SKIP = 'Skipping track #{track_id} ({duration:d} ms) in {video_id}'
                print(SKIP.format(track_id=track_id, duration=int(1000*(end-start)), video_id=video_id))
            # NOTE(review): as shown, unlabelled tracks are still stored with
            # label None despite the 'Skipping' message; a `continue` may be
            # missing above -- confirm.
            pyannote_face[Segment(start, end), track_id] = label

        # load names as pyannote.Annotation
        path = OCR.format(repository=REPOSITORY, uri=uri)
        names = ['start', 'end', 'start_frame', 'end_frame', 'name', 'confidence']
        pyannote_ocr = Annotation(uri=uri)
        # NOTE(review): the `try:` line matching the `except` below and the
        # body of that `except` are missing from this copy.
            ocr = pd.read_table(path, delim_whitespace=True, header=None, names=names)
            for _, (start, end, _, _, name, _) in ocr.iterrows():
                pyannote_ocr[Segment(start, end)] = name
        # NOTE(review): this file uses both `pd` and `pandas`; also
        # `pandas.parser` is the pre-0.20 location of this exception --
        # confirm the targeted pandas version.
        except pandas.parser.CParserError as e:

        # name each person by most co-occurring OCR name
        if not pyannote_ocr:
            named_face = Annotation(uri=uri)
            # NOTE(review): an `else:` appears to be missing before the next
            # two lines; as shown the empty annotation is always overwritten.
            named_face = argmax_tagger(pyannote_ocr, pyannote_face)
            named_face = named_face.subset(pyannote_ocr.labels())

        path = FUSION.format(repository=REPOSITORY, uri=uri)
Exemple #44
    def read(self, path, uri=None, modality=None, **kwargs):
        """Load a file as one Annotation per (uri, modality) pair.

        Parameters
        ----------
        path : str
            Path of the file to load.
        uri : str, optional
            Resource identifier applied to every entry. Required when the
            loaded table has no uri column.
        modality : str, optional
            Force all entries to be considered as coming from this modality.
            Only taken into account when file format does not provide
            any field related to modality (e.g. .seg files)
        **kwargs
            Accepted but not used by this method.

        Returns
        -------
        self
            Loaded annotations are stored in `self._loaded`, keyed by
            (uri, modality).

        Raises
        ------
        ValueError
            If the table has no uri (resp. modality) column and `uri`
            (resp. `modality`) is None.
        """

        # load whole file
        df = pandas.read_table(path,
                               header=None, names=self.fields(),
                               dtype={PYANNOTE_LABEL: object})

        # remove comment lines
        # (i.e. lines for which all fields are either None or NaN)
        # explicit copy: columns are added below, which must not write into
        # a slice of the original frame
        df = df.dropna(how='all').copy()

        # add 'segment' column build from start time & duration
        df[PYANNOTE_SEGMENT] = [self.get_segment(row)
                                for row in df.itertuples()]

        # add unique track numbers if they are not read from file
        if PYANNOTE_TRACK not in self.fields():
            df[PYANNOTE_TRACK] = range(df.shape[0])

        # add uri column in case it does not exist
        if PYANNOTE_URI not in df:
            if uri is None:
                raise ValueError('missing uri -- use uri=')
            df[PYANNOTE_URI] = uri

        # obtain list of resources
        uris = list(df[PYANNOTE_URI].unique())

        # add modality column in case it does not exist
        # (modality cannot be None past the check, so no "" fallback needed)
        if PYANNOTE_MODALITY not in df:
            if modality is None:
                raise ValueError('missing modality -- use modality=')
            df[PYANNOTE_MODALITY] = modality

        # obtain list of modalities
        modalities = list(df[PYANNOTE_MODALITY].unique())

        self._loaded = {}

        # one annotation per (resource, modality) combination, including
        # combinations that have no entry
        for current_uri in uris:
            uri_rows = df[df[PYANNOTE_URI] == current_uri]

            for current_modality in modalities:
                if current_modality is None:
                    current_modality = ""
                rows = uri_rows[uri_rows[PYANNOTE_MODALITY] == current_modality]
                self._loaded[current_uri, current_modality] = Annotation.from_df(
                    rows, modality=current_modality, uri=current_uri)

        return self
    def __call__(self, reference, hypothesis):
        """Label the reference extent with its segmentation status.

        Parameters
        ----------
        reference, hypothesis : Annotation or Timeline
            Annotations are first reduced to their timeline; anything else
            is used as-is through the Timeline API.

        Returns
        -------
        status : Annotation
            Smoothed annotation (uri taken from `reference`, single track
            '_') labelled 'correct', 'frontier', 'over' or 'under'.
        """

        if isinstance(reference, Annotation):
            reference = reference.get_timeline()

        if isinstance(hypothesis, Annotation):
            hypothesis = hypothesis.get_timeline()

        def minor_overlaps(split, splitter):
            # For each `split` segment, gather its intersections with the
            # `splitter` segments it co-occurs with, and keep every
            # intersection but the longest one.
            # NOTE(review): the `add` calls were missing in this file and
            # are restored from how `over` / `under` are used -- confirm.
            groups = []
            current = None
            for segment, other in split.co_iter(splitter):
                if not groups or segment != current:
                    groups.append([])
                    current = segment
                overlap = segment & other
                groups[-1].append((overlap.duration, overlap))

            minor = Timeline(uri=reference.uri)
            for group in groups:
                # sorted by duration: the last one is the longest
                for _, overlap in sorted(group)[:-1]:
                    minor.add(overlap)
            return minor

        # over-segmentation: reference segments split by the hypothesis
        over = minor_overlaps(reference, hypothesis)

        # under-segmentation: hypothesis segments split by the reference
        under = minor_overlaps(hypothesis, reference)

        # extent
        extent = reference.extent()

        # correct (neither under- nor over-segmented)
        correct = under.union(over).gaps(focus=extent)

        # frontier error (both under- and over-segmented)
        frontier = under.crop(over)

        # under-segmented
        not_over = over.gaps(focus=extent)
        only_under = under.crop(not_over)

        # over-segmented
        not_under = under.gaps(focus=extent)
        only_over = over.crop(not_under)

        status = Annotation(uri=reference.uri)
        labelled = (
            (correct, 'correct'),
            (frontier, 'frontier'),
            (only_over, 'over'),
            (only_under, 'under'),
        )
        for timeline, label in labelled:
            for segment in timeline:
                status[segment, '_'] = label

        return status.smooth()
        # NOTE(review): fragment -- the enclosing definition is not visible
        # and the snippet stops in the middle of a nested loop; `sd`,
        # `faces`, `shots`, `NamedSpk`, `ON`, `evidences`, `args`, `videoID`
        # and `thr_propagation` come from lines outside this copy.

        # speech-turn track -> [best face track, score]; initialised to
        # ['', thr_propagation] so that only faces scoring above the
        # threshold replace the placeholder
        dic_trackID_st_to_speakingFace = {}
        for s, t, st in sd.itertracks(label=True):
            dic_trackID_st_to_speakingFace[t] = ['', thr_propagation]
        # each line of the .mat file: speech-turn track, face track, score
        # NOTE(review): the file handle is never closed explicitly.
        for line in open(args['<mat_speaking_face>']+'/'+videoID+'.mat').read().splitlines():
            TrackID_st, TrackID_Face, proba = line.split(' ')
            if float(proba) > dic_trackID_st_to_speakingFace[int(TrackID_st)][1]: 
                dic_trackID_st_to_speakingFace[int(TrackID_st)] = [int(TrackID_Face), float(proba)]

        # face track -> name of the speaker it was associated with
        trackID_face_to_name = {}
        for s, t, name in NamedSpk.itertracks(label=True):
            if dic_trackID_st_to_speakingFace[t][0] != '': 
                trackID_face_to_name[dic_trackID_st_to_speakingFace[t][0]] = name

        # keep only the face tracks that received a name
        namedFaces = Annotation(uri=videoID)
        for s, t, faceID in faces.itertracks(label=True):
            if t in trackID_face_to_name: 
                namedFaces[s, t] = trackID_face_to_name[t]

        # write person visible and speaking in a shot:
        for sshot, tshot, shot in shots.itertracks(label=True):
            NamedSpkShot = NamedSpk.crop(sshot)
            NamedFaceShot = namedFaces.crop(sshot)
            # names that are both heard and seen within the shot
            PersonShot = set(NamedSpkShot.labels()) & set(NamedFaceShot.labels())

            for p in (PersonShot & set(evidences.keys())):
                conf = 0.0
                for sSpk in NamedSpkShot.label_timeline(p):
                    for sON, tON, name in ON.itertracks(label=True):
                        # NOTE(review): the body of this `if` is cut off.
                        if name == p:
        # NOTE(review): fragment -- the enclosing definition is not visible;
        # `corpus_id`, `video_id`, `REPOSITORY` and the SHOTS / SPEAKERS /
        # OCR path templates come from lines outside this copy.
        uri = corpus_id + '/' + video_id

        # load shots as pyannote.Annotation
        path = SHOTS.format(repository=REPOSITORY, uri=uri)
        names = ['corpus_id', 'video_id', 'shot_id', 'start', 'end']
        # shot identifiers are read as strings
        dtype = {'shot_id': str}
        shots = pd.read_table(path, delim_whitespace=True, header=None, names=names, dtype=dtype)
        pyannote_shots = Annotation(uri=uri)
        for _, (_, _, shot_id, start, end) in shots.iterrows():
            # the shot identifier is used both as track name and as label
            pyannote_shots[Segment(start, end), shot_id] = shot_id

        # load speaker diarization as pyannote.Annotation
        path = SPEAKERS.format(repository=REPOSITORY, uri=uri)
        names = ['corpus_id', 'video_id', 'start', 'end', 'label', 'gender']
        speakers = pd.read_table(path, delim_whitespace=True, header=None, names=names)
        pyannote_speakers = Annotation(uri=uri)
        for _, (_, _, start, end, label, _) in speakers.iterrows():
            pyannote_speakers[Segment(start, end)] = label
        # original speaker labels are replaced by generated integer labels
        pyannote_speakers = pyannote_speakers.anonymize_labels(generator='int')

        # load names as pyannote.Annotation
        path = OCR.format(repository=REPOSITORY, uri=uri)
        names = ['start', 'end', 'start_frame', 'end_frame', 'name', 'confidence']
        pyannote_ocr = Annotation(uri=uri)
        # NOTE(review): the `try:` line matching the `except` below and the
        # body of that `except` are missing from this copy.
            ocr = pd.read_table(path, delim_whitespace=True, header=None, names=names)
            for _, (start, end, _, _, name, _) in ocr.iterrows():
                pyannote_ocr[Segment(start, end)] = name
        # NOTE(review): this file uses both `pd` and `pandas`; also
        # `pandas.parser` is the pre-0.20 location of this exception --
        # confirm the targeted pandas version.
        except pandas.parser.CParserError as e:
    def difference(self, reference, hypothesis, uem=None, uemified=False):
        """Get error analysis as `Annotation`

        Labels are (status, reference_label, hypothesis_label) tuples.
        `status` is either 'correct', 'confusion', 'missed detection' or
        'false alarm'.
        `reference_label` is None in case of 'false alarm'.
        `hypothesis_label` is None in case of 'missed detection'.

        Parameters
        ----------
        reference, hypothesis
            Annotations to compare; both are first passed through
            `self.uemify` (with `self.collar`) and `self._handle_unknowns`.
        uem : optional
            Evaluation map forwarded to `self.uemify`.
        uemified : bool, optional
            Returns "uemified" version of reference and hypothesis.
            Defaults to False.

        Returns
        -------
        errors : `Annotation`
            Preceded by the uemified `reference` and `hypothesis` (as a
            3-tuple) when `uemified` is True.
        """

        reference, hypothesis = self.uemify(
            reference, hypothesis, uem=uem, collar=self.collar)

        reference, hypothesis = self._handle_unknowns(reference, hypothesis)

        # common (up-sampled) timeline
        # NOTE(review): the argument of union() was truncated in this file;
        # the hypothesis timeline is assumed, since the hypothesis is tagged
        # on this timeline below -- confirm.
        common_timeline = reference.get_timeline().union(
            hypothesis.get_timeline())
        common_timeline = common_timeline.segmentation()

        # align reference on common timeline
        R = self._tagger(reference, common_timeline)

        # translate and align hypothesis on common timeline
        H = self._tagger(hypothesis, common_timeline)

        errors = Annotation(uri=reference.uri, modality=reference.modality)

        # loop on all segments
        for segment in common_timeline:

            # list of labels in reference segment
            rlabels = R.get_labels(segment, unknown=self.unknown, unique=False)

            # list of labels in hypothesis segment
            hlabels = H.get_labels(segment, unknown=self.unknown, unique=False)

            _, details = self.matcher(rlabels, hlabels)

            # one track per error, named after its status
            for r, h in details[MATCH_CORRECT]:
                track = errors.new_track(segment, prefix=MATCH_CORRECT)
                errors[segment, track] = (MATCH_CORRECT, r, h)

            for r, h in details[MATCH_CONFUSION]:
                track = errors.new_track(segment, prefix=MATCH_CONFUSION)
                errors[segment, track] = (MATCH_CONFUSION, r, h)

            # NOTE(review): the prefix argument was truncated in this file;
            # restored to follow the three sibling loops -- confirm.
            for r in details[MATCH_MISSED_DETECTION]:
                track = errors.new_track(segment,
                                         prefix=MATCH_MISSED_DETECTION)
                errors[segment, track] = (MATCH_MISSED_DETECTION, r, None)

            for h in details[MATCH_FALSE_ALARM]:
                track = errors.new_track(segment, prefix=MATCH_FALSE_ALARM)
                errors[segment, track] = (MATCH_FALSE_ALARM, None, h)

        if uemified:
            return reference, hypothesis, errors
        return errors