def test(self, test_data, dump=None):
        n_obser = 0
        n_error = 0

        metric = GreedyDiarizationErrorRate()

        if dump:
            ofile = open(dump, "wb")

        for example in test_data:

            if len(example) == 2:
                observations, gold_labels = example
                partial_labels = [None] * len(gold_labels)
            else:
                observations, gold_labels, partial_labels = example

            predicted = self.predict(observations, partial_labels)

            if dump:
                pickle.dump((predicted, gold_labels), ofile)

            err, length = binary_loss_with_matching(predicted,
                                                    gold_labels,
                                                    return_rational=True)

            metric(*convert_labels(gold_labels, predicted))
            n_obser += length
            n_error += err

        if dump:
            ofile.close()

        return n_error / n_obser, abs(metric)
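The convert_labels helper is not shown above; presumably it turns fixed-step label sequences into pyannote.core.Annotation objects that the metric can consume. A minimal sketch of that accumulation pattern, with labels_to_annotation as a hypothetical stand-in using a hard-coded one-second step:

from pyannote.core import Annotation, Segment
from pyannote.metrics.diarization import GreedyDiarizationErrorRate

def labels_to_annotation(labels, step=1.0):
    # hypothetical stand-in for convert_labels: one `step`-second
    # segment per label; None marks frames without a speaker
    annotation = Annotation()
    for i, label in enumerate(labels):
        if label is not None:
            annotation[Segment(i * step, (i + 1) * step)] = label
    return annotation

metric = GreedyDiarizationErrorRate()
reference = labels_to_annotation(["A", "A", "B", "B"])
hypothesis = labels_to_annotation(["x", "x", "x", "y"])
metric(reference, hypothesis)  # accumulates components for this file
print(abs(metric))             # aggregated DER over all files seen so far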
Example #2
    def loss(self, item, y_pred):
        y_true = item[1]
        uri = 'tmp'
        der = GreedyDiarizationErrorRate()
        reference = self.generate_annotation(uri, y_true)
        hypothesis = self.generate_annotation(uri, y_pred)

        return abs(der(reference, hypothesis,
                       uem=reference.get_timeline().extent()))
    def get_metric(self) -> GreedyDiarizationErrorRate:
        """Return new instance of diarization error rate metric"""

        # defaults to optimizing diarization error rate
        if self.purity is None:
            return GreedyDiarizationErrorRate(collar=0.0, skip_overlap=False)

        # otherwise, fall back to using self.loss(...)
        raise NotImplementedError()
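A hypothetical use of get_metric() during validation; task and files are assumed here, with each file carrying a reference 'annotation', an evaluation region 'annotated', and a precomputed 'hypothesis':

metric = task.get_metric()
for file in files:
    metric(file['annotation'], file['hypothesis'], uem=file['annotated'])
print("DER = {:.2%}".format(abs(metric)))  # aggregated over all files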
def diarization(protocol,
                subset,
                hypotheses,
                greedy=False,
                collar=0.0,
                skip_overlap=False):

    options = {
        'collar': collar,
        'skip_overlap': skip_overlap,
        'parallel': True
    }

    metrics = {
        'purity': DiarizationPurity(**options),
        'coverage': DiarizationCoverage(**options)
    }

    if greedy:
        metrics['error'] = GreedyDiarizationErrorRate(**options)
    else:
        metrics['error'] = DiarizationErrorRate(**options)

    # run the evaluation; the metrics accumulate as a side effect
    get_reports(protocol, subset, hypotheses, metrics)

    report = metrics['error'].report(display=False)
    purity = metrics['purity'].report(display=False)
    coverage = metrics['coverage'].report(display=False)

    report['purity', '%'] = purity[metrics['purity'].name, '%']
    report['coverage', '%'] = coverage[metrics['coverage'].name, '%']

    # reorder columns so purity and coverage appear right after the first one
    columns = list(report.columns)
    report = report[[columns[0]] + columns[-2:] + columns[1:-2]]

    report = reindex(report)

    summary = 'Diarization ({0:s}collar = {1:g} ms{2})'.format(
        'greedy, ' if greedy else '', 1000 * collar,
        ', no overlap' if skip_overlap else '')

    headers = [summary] + \
              [report.columns[i][0] for i in range(3)] + \
              ['%' if c[1] == '%' else c[0] for c in report.columns[3:]]

    print(
        tabulate(report,
                 headers=headers,
                 tablefmt="simple",
                 floatfmt=".2f",
                 numalign="decimal",
                 stralign="left",
                 missingval="",
                 showindex="default",
                 disable_numparse=False))
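Note that collar is given in seconds at the API level, which is why the summary line multiplies it by 1000 to report milliseconds. A minimal sketch contrasting a strict and a forgiving configuration:

from pyannote.metrics.diarization import (DiarizationErrorRate,
                                          GreedyDiarizationErrorRate)

# strict: every frame counts, overlapped speech included
strict = DiarizationErrorRate(collar=0.0, skip_overlap=False)

# forgiving: remove a 250 ms collar around reference boundaries
# and exclude overlapped speech regions from evaluation
forgiving = GreedyDiarizationErrorRate(collar=0.250, skip_overlap=True)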
Example #5
        def func(reference, hypothesis, uem=None):

            # heuristic to avoid wasting time computing DER
            # when the proposed solution is obviously wrong
            r_labels = reference.crop(uem).labels()
            h_labels = hypothesis.crop(uem).labels()
            if len(h_labels) > 100 * len(r_labels):
                return 1.  # i.e. 100% diarization error rate

            metric = GreedyDiarizationErrorRate()
            return metric(reference, hypothesis, uem=uem)
    def fit(self, train_data, test_data, n_iter=1, prefix=None):
        learning_curve = []

        for i in range(n_iter):
            n_error = 0
            n_predicted = 0

            train_der = GreedyDiarizationErrorRate()

            shuffle(train_data)

            for example in bar(train_data):  # bar: progress-bar wrapper

                if len(example) == 2:
                    observations, labels = example
                    partial_labels = [None] * len(labels)
                else:
                    observations, labels, partial_labels = example

                predicted = [
                    pl for pl, state in self._infer(observations, labels,
                                                    partial_labels)
                ]
                assert len(predicted) == len(observations)

                train_der(*convert_labels(labels, predicted))
                err, length = binary_loss_with_matching(predicted,
                                                        labels,
                                                        return_rational=True)

                n_predicted += length
                n_error += err

            print("iteration {}".format(i))
            print("error: {:.2%}/{:.2%}".format(n_error / n_predicted,
                                                abs(train_der)))

            # evaluate with the averaged weights, then restore the
            # current (non-averaged) weights before resuming training
            w = deepcopy(self.model.weight)
            self.model.avg()
            test_loss, test_der = self.test(
                test_data,
                dump=prefix.format(i) if prefix is not None else None)
            print("test: {:.2%}/{:.2%}".format(test_loss, test_der))

            self.model.weight = w

            learning_curve.append({
                "train_loss": n_error / n_predicted,
                "train_der": abs(train_der),
                "test_loss": test_loss,
                "test_der": test_der
            })
        return learning_curve
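A hypothetical call, assuming trainer is an instance of this class; the returned learning curve makes it straightforward to pick the best iteration:

curve = trainer.fit(train_data, test_data, n_iter=10,
                    prefix="predictions_{:02d}.pkl")

# pick the iteration with the lowest test diarization error rate
best = min(range(len(curve)), key=lambda i: curve[i]["test_der"])
print("best iteration: {}, test DER: {:.2%}".format(
    best, curve[best]["test_der"]))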
Example #7
        def objective_function(params):

            metric = GreedyDiarizationErrorRate()

            covariance_type, penalty_coef = params
            process_one_file = functools.partial(
                helper_cluster_tune,
                metric=metric,
                covariance_type=covariance_type,
                penalty_coef=penalty_coef)

            if n_jobs > 1:
                results = list(
                    pool.map(process_one_file,
                             zip(items, segmentations, features)))
            else:
                results = [
                    process_one_file(isf)
                    for isf in zip(items, segmentations, features)
                ]

            return abs(metric)
def xp_objective(args, **kwargs):
    import sys
    sys.path.append("/people/yin/projects/")
    from pyannote.database import get_protocol, get_annotated, FileFinder
    protocol = get_protocol('Etape.SpeakerDiarization.TV',
                            preprocessors={'audio': FileFinder()})

    from pyannote.metrics.diarization import GreedyDiarizationErrorRate
    metric = GreedyDiarizationErrorRate()

    from optimize_cluster import speaker_diarization
    from pyannote.audio.features import Precomputed

    feature_extraction = Precomputed(
        '/vol/work1/bredin/feature_extraction/mfcc')
    sad_pre = '/vol/work1/yin/speech_activity_detection/shallow/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.TV.development/apply'
    scd_pre = '/vol/work1/yin/speaker_change_detection/paper/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.Debug.development/apply'
    emb_pre = '/vol/work1/yin/embedding/20180124'

    args['cls__damping'] = float(args['cls__damping'])
    args['cls__preference'] = float(args['cls__preference'])

    pipeline = speaker_diarization.SpeakerDiarizationPre(
        feature_extraction, sad_pre, scd_pre, emb_pre, **args)
    try:
        for current_file in protocol.train():
            hypothesis = pipeline(current_file, annotated=True)
            if hypothesis is None:
                # penalize configurations that produce no hypothesis
                return 100
            reference = current_file['annotation']
            uem = get_annotated(current_file)
            metric(reference, hypothesis, uem=uem)
    except MemoryError:
        # penalize configurations that run out of memory
        return 100

    return abs(metric)
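A hypothetical direct call; the parameter values are illustrative only, and SpeakerDiarizationPre may expect additional keys beyond the two cast to float above:

params = {'cls__damping': 0.8, 'cls__preference': -5.0}
der = xp_objective(params)
print(der)  # aggregated DER on the train set, or 100 if the pipeline failed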
    def get_metric(self) -> GreedyDiarizationErrorRate:
        """Return new instance of diarization error rate metric"""
        return GreedyDiarizationErrorRate(collar=0.0, skip_overlap=False)
    protocol = get_protocol(arguments['<database.task.protocol>'],
                            preprocessors={'audio': FileFinder()})
    subset = arguments['<subset>']

    diarization_mdtm = arguments['<diarization.mdtm>']
    parser = MDTMParser()
    annotations = parser.read(diarization_mdtm)

    diarization_res = {}
    for uri in annotations.uris:
        if uri not in diarization_res:
            diarization_res[uri] = Annotation(uri=uri)
        diarization_res[uri].update(annotations(uri=uri, modality="speaker"))

    from pyannote.metrics.diarization import GreedyDiarizationErrorRate
    # strict DER: no collar, overlapped speech included
    metric1 = GreedyDiarizationErrorRate(parallel=False)
    # 500 ms collar, overlapped speech excluded
    metric2 = GreedyDiarizationErrorRate(parallel=False,
                                         collar=0.500,
                                         skip_overlap=True)
    # 500 ms collar, overlapped speech included
    metric3 = GreedyDiarizationErrorRate(parallel=False,
                                         collar=0.500,
                                         skip_overlap=False)

    from optimize_cluster import speaker_diarization
    from pyannote.audio.features import Precomputed

    file_list = []
    for current_file in getattr(protocol, subset)():
        uri = get_unique_identifier(current_file).split('/')[1]
        hypothesis = diarization_res[uri]
        reference = current_file['annotation']