def test_load(sample):
    """Check that MDTMParser loads the expected 'speech' tracks of 'uri1'.

    `sample` is handed as-is to `MDTMParser.read` (presumably the path to an
    MDTM fixture -- confirm against the fixture definition).
    """
    loaded = MDTMParser().read(sample)

    # restrict to file 'uri1' and modality 'speech'
    speech1 = loaded(uri="uri1", modality="speech")

    # (segment, track, label) triples expected in iteration order
    expected_tracks = [
        (Segment(1, 3.5), 0, 'alice'),
        (Segment(3, 7.5), 1, 'barbara'),
        (Segment(6, 9), 2, 'chris'),
    ]
    observed_tracks = list(speech1.itertracks(label=True))

    assert observed_tracks == expected_tracks
def dev_iter(self):
    """Iterate over the files of the development set.

    Annotations are read from 'protocol1.dev.mdtm', located in the 'data'
    directory that sits next to this module.

    Yields
    ------
    current_file : dict
        'database' : name of the database ('ikdp')
        'uri' : unique file identifier
        'annotation' : reference returned by the parser for this file
    """
    # absolute path to the MDTM file inside the sibling 'data' directory
    module_dir = op.dirname(op.realpath(__file__))
    mdtm_path = op.join(module_dir, 'data', 'protocol1.dev.mdtm')

    parsed = MDTMParser().read(mdtm_path)

    # files are visited in sorted order of their identifier
    for uri in sorted(parsed.uris):
        yield {
            'database': 'ikdp',
            'uri': uri,
            'annotation': parsed(uri),
        }
def trn_iter(self):
    """Iterate over the files of the training set.

    Annotations are read from 'protocol1.train.mdtm', located in the 'data'
    directory that sits next to this module. MDTM is used here because
    pyannote.parser ships a parser for it; the format is not mandatory.

    Yields
    ------
    current_file : dict
        'database' : name of the database ('ikdp')
        'uri' : unique file identifier
        'annotation' : reference returned by the parser for this file
    """
    # NOTE (Niko): reading ELAN files more directly with pympi could be
    # tested later as an alternative to MDTM.
    module_dir = op.dirname(op.realpath(__file__))
    mdtm_path = op.join(module_dir, 'data', 'protocol1.train.mdtm')

    parsed = MDTMParser().read(mdtm_path)

    # files are visited in sorted order of their identifier
    for uri in sorted(parsed.uris):
        yield {
            'database': 'ikdp',
            'uri': uri,
            'annotation': parsed(uri),
        }
class EsterSpeakerDiarizationProtocol(SpeakerDiarizationProtocol):
    """Base speaker diarization protocol for ESTER database

    Parameters
    ----------
    preprocessors : optional
        Forwarded unchanged to the parent constructor. When omitted (or
        None), a fresh empty dict is used.
    **kwargs
        Forwarded unchanged to the parent constructor.
    """

    def __init__(self, preprocessors=None, **kwargs):
        # a `{}` default would be one dict shared by every instance;
        # build a new one per call instead
        if preprocessors is None:
            preprocessors = {}
        super(EsterSpeakerDiarizationProtocol, self).__init__(
            preprocessors=preprocessors, **kwargs)
        self.uem_parser_ = UEMParser()
        self.mdtm_parser_ = MDTMParser()

    def _subset(self, protocol, subset):
        """Iterate over the files of one subset of one protocol.

        Both '{protocol}.{subset}.uem' and '{protocol}.{subset}.mdtm' are
        read from the 'data' directory next to this module. Files are
        enumerated from the UEM file, in sorted order of their identifier.

        Yields
        ------
        current_file : dict
            'database' : 'Ester'
            'uri' : unique file identifier
            'annotated' : what the UEM parser returns for this file
            'annotation' : what the MDTM parser returns for this file
        """
        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

        # load annotated parts
        # e.g. /data/{tv|radio|all}.{train|dev|test}.uem
        path = op.join(
            data_dir,
            '{protocol}.{subset}.uem'.format(subset=subset,
                                             protocol=protocol))
        uems = self.uem_parser_.read(path)

        # load annotations
        path = op.join(
            data_dir,
            '{protocol}.{subset}.mdtm'.format(subset=subset,
                                              protocol=protocol))
        mdtms = self.mdtm_parser_.read(path)

        for uri in sorted(uems.uris):
            annotated = uems(uri)
            annotation = mdtms(uri)
            current_file = {
                'database': 'Ester',
                'uri': uri,
                'annotated': annotated,
                'annotation': annotation,
            }
            yield current_file
class LibriSpeechSpeakerRecognitionProtocol(SpeakerDiarizationProtocol):
    """LibriSpeech protocol backed by MDTM annotation files

    Parameters
    ----------
    preprocessors : optional
        Forwarded unchanged to the parent constructor. When omitted (or
        None), a fresh empty dict is used.
    **kwargs
        Forwarded unchanged to the parent constructor.
    """

    def __init__(self, preprocessors=None, **kwargs):
        # a `{}` default would be one dict shared by every instance;
        # build a new one per call instead
        if preprocessors is None:
            preprocessors = {}
        super(LibriSpeechSpeakerRecognitionProtocol, self).__init__(
            preprocessors=preprocessors, **kwargs)
        self.mdtm_parser_ = MDTMParser()

    def _subset(self, protocol, subset):
        """Iterate over the files of one subset of one protocol.

        Annotations are read from 'librispeech-{protocol}.{subset}.mdtm' in
        the 'data' directory next to this module. Files are visited in
        sorted order of their identifier.

        Yields
        ------
        current_file : dict
            'database' : 'LibriSpeech'
            'uri' : unique file identifier
            'annotation' : what the MDTM parser returns for this file
            'annotated' : Timeline made of one segment, the extent of the
                annotation timeline
        """
        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

        # load annotations
        path = op.join(
            data_dir,
            'librispeech-{protocol}.{subset}.mdtm'.format(subset=subset,
                                                          protocol=protocol))
        mdtms = self.mdtm_parser_.read(path)

        for uri in sorted(mdtms.uris):
            annotation = mdtms(uri)
            current_file = {
                'database': 'LibriSpeech',
                'uri': uri,
                'annotation': annotation,
                # annotated part as pyannote.core.Timeline instance
                'annotated': Timeline(
                    uri=uri,
                    segments=[annotation.get_timeline().extent()]),
            }
            yield current_file
def tst_iter(self):
    """Iterate over the files of the test set.

    Annotations are read from 'fullref.mdtm', located in the 'data'
    directory that sits next to this module. Files are visited in sorted
    order of their identifier.

    Yields
    ------
    current_file : dict
        'database' : name of the database ('CallHome')
        'uri' : unique file identifier
        'annotation' : reference returned by the parser for this file
        'annotated' : pyannote.core.Timeline made of one segment, the
            extent of the annotation timeline
    """
    # imported locally so this method does not depend on a module-level
    # import of Timeline
    from pyannote.core import Timeline

    # absolute path to 'data' directory where annotations are stored
    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

    annotations = MDTMParser().read(op.join(data_dir, 'fullref.mdtm'))

    # iterate over each file in test set
    for uri in sorted(annotations.uris):

        # get annotations as pyannote.core.Annotation instance
        annotation = annotations(uri)

        # `.extent()` alone is a single Segment; wrap it so that
        # 'annotated' really is a Timeline, as documented
        annotated = Timeline(uri=uri,
                             segments=[annotation.get_timeline().extent()])

        yield {
            # name of the database class
            'database': 'CallHome',
            # unique file identifier
            'uri': uri,
            # reference as pyannote.core.Annotation instance
            'annotation': annotation,
            # annotated part as pyannote.core.Timeline instance
            'annotated': annotated,
        }
class SwitchBoardSpeakerRecognitionProtocol(SpeakerDiarizationProtocol):
    """SwitchBoard protocol backed by MDTM annotation files

    Parameters
    ----------
    preprocessors : optional
        Forwarded unchanged to the parent constructor. When omitted (or
        None), a fresh empty dict is used.
    **kwargs
        Forwarded unchanged to the parent constructor.
    """

    def __init__(self, preprocessors=None, **kwargs):
        # a `{}` default would be one dict shared by every instance;
        # build a new one per call instead
        if preprocessors is None:
            preprocessors = {}
        super(SwitchBoardSpeakerRecognitionProtocol, self).__init__(
            preprocessors=preprocessors, **kwargs)
        self.mdtm_parser_ = MDTMParser()

    def _subset(self, protocol, subset):
        """Iterate over the files of one subset of one protocol.

        Annotations are read from 'switchboard-{protocol}.{subset}.mdtm' in
        the 'data' directory next to this module. Files are visited in
        sorted order of their identifier.

        Yields
        ------
        current_file : dict
            'database' : 'SwitchBoard'
            'uri' : unique file identifier
            'annotation' : what the MDTM parser returns for this file
        """
        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

        # load annotations
        path = op.join(
            data_dir,
            'switchboard-{protocol}.{subset}.mdtm'.format(subset=subset,
                                                          protocol=protocol))
        mdtms = self.mdtm_parser_.read(path)

        for uri in sorted(mdtms.uris):
            annotation = mdtms(uri)
            current_file = {
                'database': 'SwitchBoard',
                'uri': uri,
                'annotation': annotation,
            }
            yield current_file
def dev_iter(self):
    """Iterate over the files of the development set.

    Annotations are read from 'TimitSpeakerVerificationProtocol.val.mdtm',
    located in the 'data' directory that sits next to this module. MDTM is
    used here because pyannote.parser ships a parser for it; the format is
    not mandatory.

    Yields
    ------
    current_file : dict
        'database' : name of the database ('Timit')
        'uri' : unique file identifier
        'annotation' : reference returned by the parser for this file
    """
    # absolute path to the MDTM file inside the sibling 'data' directory
    module_dir = op.dirname(op.realpath(__file__))
    mdtm_path = op.join(module_dir, 'data',
                        'TimitSpeakerVerificationProtocol.val.mdtm')

    parsed = MDTMParser().read(mdtm_path)

    # files are visited in sorted order of their identifier
    for uri in sorted(parsed.uris):
        yield {
            'database': 'Timit',
            'uri': uri,
            'annotation': parsed(uri),
        }
def do_apply(model_pkl, features_pkl, hypothesis_mdtm,
             min_duration=0.250, constraint_mdtm=None):
    """Apply a pickled model to pickled features and save the result as MDTM.

    Parameters
    ----------
    model_pkl : str
        Path to the pickled model. Its `min_duration` attribute is
        overwritten with `min_duration` before it is applied.
    features_pkl : str
        Path to the pickled features.
    hypothesis_mdtm : str
        Path of the MDTM file to write.
    min_duration : float, optional
        Value assigned to the model's `min_duration`. Defaults to 0.250.
    constraint_mdtm : str, optional
        Path to an MDTM file used as constraint. When falsy, no constraint
        is passed to the model.
    """
    # NOTE(review): unpickling can execute arbitrary code -- both pickle
    # files must come from a trusted source.
    with open(model_pkl, 'rb') as model_file:
        hmm = pickle.load(model_file)
    hmm.min_duration = min_duration

    with open(features_pkl, 'rb') as features_file:
        features = pickle.load(features_file)

    # the parsed MDTM is called without argument to obtain the constraint
    constraint = (MDTMParser().read(constraint_mdtm)()
                  if constraint_mdtm else None)

    hypothesis = hmm.apply(features, constraint=constraint)

    with open(hypothesis_mdtm, 'w') as output_file:
        MDTMParser().write(hypothesis, f=output_file)
class EtapeSpeakerDiarizationProtocol(SpeakerDiarizationProtocol):
    """Base speaker diarization protocol for ETAPE database

    This class should be inherited from, not used directly.

    Parameters
    ----------
    preprocessors : dict or (key, preprocessor) iterable
        When provided, each protocol item (dictionary) is preprocessed, such
        that item[key] = preprocessor(**item). In case 'preprocessor' is not
        callable, it should be a string containing placeholders for item
        keys (e.g. {'wav': '/path/to/{uri}.wav'}). When omitted (or None),
        a fresh empty dict is used.
    **kwargs
        Forwarded unchanged to the parent constructor.
    """

    def __init__(self, preprocessors=None, **kwargs):
        # a `{}` default would be one dict shared by every instance;
        # build a new one per call instead
        if preprocessors is None:
            preprocessors = {}
        super(EtapeSpeakerDiarizationProtocol, self).__init__(
            preprocessors=preprocessors, **kwargs)
        self.uem_parser_ = UEMParser()
        self.mdtm_parser_ = MDTMParser()

    def _subset(self, protocol, subset):
        """Iterate over the files of one subset of one protocol.

        Both '{protocol}.{subset}.uem' and '{protocol}.{subset}.mdtm' are
        read from the 'data' directory next to this module. Files are
        enumerated from the UEM file, in sorted order of their identifier.

        Yields
        ------
        current_file : dict
            'database' : 'Etape'
            'uri' : unique file identifier
            'annotated' : what the UEM parser returns for this file
            'annotation' : what the MDTM parser returns for this file
        """
        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

        # load annotated parts
        # e.g. /data/{tv|radio|all}.{train|dev|test}.uem
        path = op.join(
            data_dir,
            '{protocol}.{subset}.uem'.format(subset=subset,
                                             protocol=protocol))
        uems = self.uem_parser_.read(path)

        # load annotations
        path = op.join(
            data_dir,
            '{protocol}.{subset}.mdtm'.format(subset=subset,
                                              protocol=protocol))
        mdtms = self.mdtm_parser_.read(path)

        for uri in sorted(uems.uris):
            annotated = uems(uri)
            annotation = mdtms(uri)
            current_file = {
                'database': 'Etape',
                'uri': uri,
                'annotated': annotated,
                'annotation': annotation,
            }
            yield current_file
llss = [] trials = getattr(protocol, '{subset}_trial'.format(subset=subset))() for current_trial in trials: reference = current_trial.pop('reference') hypothesis = speaker_spotting_try_diarization(current_trial) llss.append(process_trial(current_trial, hypothesis)) import simplejson as json with open(output_file, 'w') as outfile: json.dump(llss, outfile) if arguments['automatic']: from pyannote.parser import MDTMParser diarization_mdtm = arguments['<diarization.mdtm>'] parser = MDTMParser() annotations = parser.read(diarization_mdtm) REFERENCE = {} for uri_part in annotations.uris: uri = uri_part.split('_')[0] + '.Mix-Headset' if uri not in REFERENCE: REFERENCE[uri] = Annotation(uri=uri) REFERENCE[uri].update(annotations(uri=uri_part, modality="speaker")) llss = [] trials = getattr(protocol, '{subset}_trial'.format(subset=subset))() for current_trial in trials: reference = current_trial.pop('reference')
def __init__(self, preprocessors=None, **kwargs):
    """Initialize the parent protocol and create the MDTM parser.

    Parameters
    ----------
    preprocessors : optional
        Forwarded unchanged to the parent constructor. When omitted (or
        None), a fresh empty dict is used.
    **kwargs
        Forwarded unchanged to the parent constructor.
    """
    # a `{}` default would be one dict shared by every instance;
    # build a new one per call instead
    if preprocessors is None:
        preprocessors = {}
    super(GameOfThronesSpeakerDiarizationProtocol, self).__init__(
        preprocessors=preprocessors, **kwargs)
    self.mdtm_parser_ = MDTMParser()
def __init__(self, preprocessors=None, **kwargs):
    """Initialize the parent protocol and create the MDTM parser.

    Parameters
    ----------
    preprocessors : optional
        Forwarded unchanged to the parent constructor. When omitted (or
        None), a fresh empty dict is used.
    **kwargs
        Forwarded unchanged to the parent constructor.
    """
    # a `{}` default would be one dict shared by every instance;
    # build a new one per call instead
    if preprocessors is None:
        preprocessors = {}
    super(SwitchBoardSpeakerRecognitionProtocol, self).__init__(
        preprocessors=preprocessors, **kwargs)
    self.mdtm_parser_ = MDTMParser()
import numpy as np from pyannote.database import get_protocol, FileFinder protocol = get_protocol('AMI.SpeakerSpotting.MixHeadset', progress=True) from pyannote.core import Annotation, Segment, Timeline REFERENCE = {} for current_file in protocol.development(): uri = current_file['uri'] if uri not in REFERENCE: REFERENCE[uri] = Annotation(uri=uri) REFERENCE[uri].update(current_file['annotation']) from pyannote.parser import MDTMParser sad_dev = '/people/yin/projects/online_clustering/spotting/AMI.SpeakerSpotting.MixHeadset.development.mdtm' parser_dev = MDTMParser() annotations_dev = parser_dev.read(sad_dev) SAD = {} for item in protocol.development(): uri = item['uri'] SAD[uri] = annotations_dev(uri=uri, modality="speaker").get_timeline().support() class PyannoteFeatureExtractionError(Exception): pass class Precomputed(object): """Load precomputed features from HDF5 file Parameters
llss = [] trials = getattr(protocol, '{subset}_trial'.format(subset=subset))() for current_trial in trials: reference = current_trial.pop('reference') hypothesis = speaker_spotting_try_diarization(current_trial) llss.append(process_trial(current_trial, hypothesis)) import simplejson as json with open(output_file, 'w') as outfile: json.dump(llss, outfile) if arguments['automatic']: from pyannote.parser import MDTMParser diarization_mdtm = arguments['<diarization.mdtm>'] parser = MDTMParser() annotations = parser.read(diarization_mdtm) REFERENCE = {} for uri_part in annotations.uris: uri = uri_part.split('_')[0] + '.Mix-Headset' if uri not in REFERENCE: REFERENCE[uri] = Annotation(uri=uri) REFERENCE[uri].update(annotations(uri=uri_part, modality="speaker")) llss = [] trials = getattr(protocol, '{subset}_trial'.format(subset=subset))() for current_trial in trials: reference = current_trial.pop('reference') hypothesis = speaker_spotting_try_diarization(current_trial)
def __init__(self, **kwargs):
    """Initialize the parent protocol, then create the UEM and MDTM parsers.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to the parent constructor.
    """
    parent = super(EtapeSpeakerDiarizationProtocol, self)
    parent.__init__(**kwargs)

    # one parser per annotation file format
    self.uem_parser_, self.mdtm_parser_ = UEMParser(), MDTMParser()
# enrolment consists in summing all relevant embeddings
def speaker_spotting_enrol(current_enrolment):
    """Build a speaker model from one enrolment item.

    The embeddings returned by `precomputed` for this item are cropped to
    `current_enrolment['enrol_with']` and summed along the first axis
    (that axis is kept, with length 1).
    """
    enrolment_region = current_enrolment['enrol_with']
    file_embeddings = precomputed(current_enrolment)
    cropped = file_embeddings.crop(enrolment_region)
    return np.sum(cropped, axis=0, keepdims=True)


# one model per enrolment of the development set, indexed by model id
models = {}
for current_enrolment in protocol.development_enrolment():
    # 'model_id' is removed from the item before it is used for enrolment
    model_id = current_enrolment.pop('model_id')
    models[model_id] = speaker_spotting_enrol(current_enrolment)

from pyannote.parser import MDTMParser

cluster_mdtm = '/people/yin/projects/online_clustering/spotting/EURECOM-online-diarization-pyannote-VAD.dev.WithOffset.mdtm'
parser_dev = MDTMParser()
annotations_dev = parser_dev.read(cluster_mdtm)

# merge the 'speaker' annotations of every '<prefix>_...' part into a single
# annotation stored under '<prefix>.Mix-Headset'
REFERENCE = {}
for uri_part in annotations_dev.uris:
    uri = '{}.Mix-Headset'.format(uri_part.split('_')[0])
    if uri not in REFERENCE:
        REFERENCE[uri] = Annotation(uri=uri)
    REFERENCE[uri].update(
        annotations_dev(uri=uri_part, modality="speaker"))

# Trials
from pyannote.core import SlidingWindow, SlidingWindowFeature
from pyannote.audio.embedding.utils import cdist
def tune(self, protocol_name, subset='development'):
    """Tune `covariance_type` and `penalty_coef` on one protocol subset.

    The search is run by `skopt.gp_minimize` (20 calls, 10 of them random
    starts) over `covariance_type` in {'full', 'diag'} and `penalty_coef`
    in [0, 5]. After each call, the convergence plot and the current best
    parameters are written to the PNG and YAML files of the tuning
    directory.

    Parameters
    ----------
    protocol_name : str
        Name passed to `get_protocol`.
    subset : str, optional
        Name of the protocol method providing the files.
        Defaults to 'development'.

    Returns
    -------
    params : dict
        {'covariance_type': best covariance type}
    objective : value of `res.fun` for the best parameters

    Raises
    ------
    ValueError
        When the subset contains no file.
    """
    tune_dir = self.TUNE_DIR.format(experiment_dir=self.experiment_dir,
                                    protocol=protocol_name, subset=subset)
    mkdir_p(tune_dir)

    tune_yml = self.TUNE_YML.format(tune_dir=tune_dir)
    tune_png = self.TUNE_PNG.format(tune_dir=tune_dir)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)
    items = list(getattr(protocol, subset)())

    # a pool cannot be created with zero workers; fail early with an
    # explicit message instead of the one raised by Pool(0)
    if not items:
        raise ValueError(
            'Subset "{subset}" of protocol "{protocol}" is empty.'.format(
                subset=subset, protocol=protocol_name))

    # segmentation
    segmentation_mdtm = self.SEGMENTATION_MDTM.format(
        segmentation_dir=self.segmentation_dir_,
        protocol=protocol_name, subset=subset)
    parser = MDTMParser().read(segmentation_mdtm)
    segmentations = [parser(item['uri']) for item in items]

    # features
    features = [self.feature_extraction_(item) for item in items]

    n_jobs = min(cpu_count(), len(items))
    # workers are only needed when more than one job is requested
    pool = Pool(n_jobs) if n_jobs > 1 else None
    print(n_jobs, 'jobs')

    def callback(res):
        """Save convergence plot and current best parameters."""

        # plot convergence
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import skopt.plots
        _ = skopt.plots.plot_convergence(res)
        plt.savefig(tune_png, dpi=75)
        plt.close()

        # save state
        params = {
            'status': {'objective': float(res.fun)},
            'covariance_type': str(res.x[0]),
            'penalty_coef': float(res.x[1]),
        }
        with io.open(tune_yml, 'w') as fp:
            yaml.dump(params, fp, default_flow_style=False)

    def objective_function(params):
        """Return abs(metric) after processing every file with `params`."""

        metric = GreedyDiarizationErrorRate()
        covariance_type, penalty_coef = params

        process_one_file = functools.partial(
            helper_cluster_tune, metric=metric,
            covariance_type=covariance_type, penalty_coef=penalty_coef)

        # per-file return values are not used, only `metric` is
        if pool is not None:
            # NOTE(review): if `Pool` is a process pool, each worker gets
            # its own copy of `metric` and the updates made there are not
            # visible here -- confirm how helper_cluster_tune reports back.
            pool.map(process_one_file, zip(items, segmentations, features))
        else:
            for isf in zip(items, segmentations, features):
                process_one_file(isf)

        return abs(metric)

    space = [
        skopt.space.Categorical(['full', 'diag']),
        skopt.space.Real(0., 5., prior='uniform'),
    ]

    try:
        res = skopt.gp_minimize(
            objective_function, space, random_state=1337,
            n_calls=20, n_random_starts=10, verbose=True,
            callback=callback)
    finally:
        # release the workers whether or not the optimization succeeded
        if pool is not None:
            pool.terminate()
            pool.join()

    # NOTE(review): the best `penalty_coef` (res.x[1]) is saved to the YAML
    # file by `callback` but is not part of the returned dictionary.
    return {'covariance_type': str(res.x[0])}, res.fun
# enrolment consists in summing all relevant embeddings
def speaker_spotting_enrol(current_enrolment):
    """Build a speaker model from one enrolment item.

    The embeddings returned by `precomputed` for this item are cropped to
    `current_enrolment['enrol_with']` and summed along the first axis
    (that axis is kept, with length 1).
    """
    enrolment_region = current_enrolment['enrol_with']
    file_embeddings = precomputed(current_enrolment)
    cropped = file_embeddings.crop(enrolment_region)
    return np.sum(cropped, axis=0, keepdims=True)


# one model per enrolment of the test set, indexed by model id
models = {}
for current_enrolment in protocol.test_enrolment():
    # 'model_id' is removed from the item before it is used for enrolment
    model_id = current_enrolment.pop('model_id')
    models[model_id] = speaker_spotting_enrol(current_enrolment)

from pyannote.parser import MDTMParser

cluster_mdtm = 'OD_AVAD_tst.mdtm'
parser_tst = MDTMParser()
annotations_tst = parser_tst.read(cluster_mdtm)

# merge the 'speaker' annotations of every '<prefix>_...' part into a single
# annotation stored under '<prefix>.Mix-Headset'
REFERENCE = {}
for uri_part in annotations_tst.uris:
    uri = '{}.Mix-Headset'.format(uri_part.split('_')[0])
    if uri not in REFERENCE:
        REFERENCE[uri] = Annotation(uri=uri)
    REFERENCE[uri].update(
        annotations_tst(uri=uri_part, modality="speaker"))

# Trials
from pyannote.core import SlidingWindow, SlidingWindowFeature
from pyannote.audio.embedding.utils import cdist
def __init__(self, preprocessors=None, **kwargs):
    """Initialize the parent protocol and create the MDTM and UEM parsers.

    Parameters
    ----------
    preprocessors : optional
        Forwarded unchanged to the parent constructor. When omitted (or
        None), a fresh empty dict is used.
    **kwargs
        Forwarded unchanged to the parent constructor.
    """
    # a `{}` default would be one dict shared by every instance;
    # build a new one per call instead
    if preprocessors is None:
        preprocessors = {}
    super(OdessaAMISpeakerDiarizationProtocol, self).__init__(
        preprocessors=preprocessors, **kwargs)
    self.mdtm_parser_ = MDTMParser()
    self.uem_parser_ = UEMParser()
import numpy as np from pyannote.database import get_protocol, FileFinder protocol = get_protocol('AMI.SpeakerSpotting.MixHeadset', progress=True) from pyannote.core import Annotation,Segment, Timeline REFERENCE = {} for current_file in protocol.test(): uri = current_file['uri'] if uri not in REFERENCE: REFERENCE[uri] = Annotation(uri=uri) REFERENCE[uri].update(current_file['annotation']) from pyannote.parser import MDTMParser sad_tst = '/people/yin/projects/online_clustering/spotting_test/AMI.SpeakerSpotting.MixHeadset.test.mdtm' parser_tst = MDTMParser() annotations_tst = parser_tst.read(sad_tst) SAD = {} for item in protocol.test(): uri = item['uri'] SAD[uri] = annotations_tst(uri=uri, modality="speaker").get_timeline().support() class PyannoteFeatureExtractionError(Exception): pass class Precomputed(object): """Load precomputed features from HDF5 file Parameters ---------- features_h5 : str Path to HDF5 file generated by script 'feature_extraction.py'.
def apply(self, protocol_name, subset='test'):
    """Apply the tuned model to one protocol subset and save the results.

    For every file of the subset, the aggregated prediction is stored in
    an HDF5 file under the 'apply' directory. Predictions are then read
    back, binarized with the tuned onset/offset, and written to a single
    MDTM file.

    Parameters
    ----------
    protocol_name : str
        Name passed to `get_protocol`.
    subset : str, optional
        Name of the protocol method providing the files.
        Defaults to 'test'.
    """
    apply_dir = self.APPLY_DIR.format(tune_dir=self.tune_dir_)
    mkdir_p(apply_dir)

    # load tuning results
    # (safe_load: a bare yaml.load(fp) is rejected by recent PyYAML and
    # can build arbitrary Python objects on older versions)
    tune_yml = self.TUNE_YML.format(tune_dir=self.tune_dir_)
    with io.open(tune_yml, 'r') as fp:
        self.tune_ = yaml.safe_load(fp)

    # load model for epoch 'epoch'
    epoch = self.tune_['epoch']
    sequence_labeling = SequenceLabeling.from_disk(self.train_dir_, epoch)

    # initialize sequence labeling
    duration = self.config_['sequences']['duration']
    step = self.config_['sequences']['step']
    aggregation = SequenceLabelingAggregation(
        sequence_labeling, self.feature_extraction_,
        duration=duration, step=step)

    # initialize protocol
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    def write_sliding_window(h5_file, prediction):
        """Store sliding window and dimension information as attributes."""
        window = prediction.sliding_window
        h5_file.attrs['start'] = window.start
        h5_file.attrs['duration'] = window.duration
        h5_file.attrs['step'] = window.step
        h5_file.attrs['dimension'] = 2

    for i, item in enumerate(getattr(protocol, subset)()):

        prediction = aggregation.apply(item)

        if i == 0:
            # create metadata file at root that contains
            # sliding window and dimension information
            path = Precomputed.get_config_path(apply_dir)
            # explicit 'a' (read/write, create if missing): recent h5py
            # opens read-only when no mode is given
            with h5py.File(path, mode='a') as f:
                write_sliding_window(f, prediction)

        path = Precomputed.get_path(apply_dir, item)

        # create parent directory
        mkdir_p(dirname(path))

        with h5py.File(path, mode='a') as f:
            write_sliding_window(f, prediction)
            f.create_dataset('features', data=prediction.data)

    # initialize binarizer
    onset = self.tune_['onset']
    offset = self.tune_['offset']
    binarize = Binarize(onset=onset, offset=offset)

    precomputed = Precomputed(root_dir=apply_dir)

    writer = MDTMParser()
    path = self.HARD_MDTM.format(apply_dir=apply_dir,
                                 protocol=protocol_name, subset=subset)
    with io.open(path, mode='w') as gp:
        for item in getattr(protocol, subset)():
            prediction = precomputed(item)
            segmentation = binarize.apply(prediction, dimension=1)
            writer.write(segmentation.to_annotation(), f=gp,
                         uri=item['uri'], modality='speaker')
# `f` is the file object opened just before this point
videos = [line.strip() for line in f.readlines()]
f.close()

# standard condition
standard_condition = UEMParser("data/standard_condition.uem")

# annotated frames
annotated_frames = UEMParser("data/annotated_frames.uem")

# list of anchors
# (context manager: the file is closed even if reading fails)
with open("data/anchors.txt", "r") as f:
    anchors = [line.strip() for line in f]

# manual speaker identification
manual_speaker_identification = MDTMParser("data/manual_speaker.mdtm",
                                           multitrack=True)

# --------------------------------------------------
# LOAD MONOMODAL COMPONENTS OUTPUT ON TEST SET
# as described in Section "2. Monomodal Components"
# --------------------------------------------------

# automatic speaker diarization
auto_speaker_diarization = MDTMParser("data/auto_speaker_diarization.mdtm",
                                      multitrack=True)

# automatic speaker identification
auto_speaker_identification = REPEREParser(
    "data/auto_speaker_identification.repere",
    multitrack=True, confidence=False)